Ritvik19 committed on
Commit 7d7e60e · verified · 1 Parent(s): 7c314fe

Upload 15 files
README.md CHANGED
@@ -1,3 +1,52 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: mit
+ library_name: peft
+ tags:
+ - alignment-handbook
+ - generated_from_trainer
+ base_model: microsoft/phi-1_5
+ datasets:
+ - Ritvik19/open-hermes-2_5-reformatted
+ model-index:
+ - name: openhermes-phi-1_5-sft-qlora
+ results: []
+ ---
+
+ **Note**: This model card has been generated automatically according to the information the Trainer had access to.
+ Visit the [model card](https://ritvik19.github.io/small-llms/) to see the full description.
+
+ # openhermes-phi-1_5-sft-qlora
+
+ This model is a fine-tuned version of [microsoft/phi-1_5](https://huggingface.co/microsoft/phi-1_5) on the Ritvik19/open-hermes-2_5-reformatted dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 1.4192
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 1
+ - eval_batch_size: 1
+ - seed: 42
+ - distributed_type: multi-GPU
+ - gradient_accumulation_steps: 128
+ - total_train_batch_size: 128
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 1
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 1.3662 | 0.9994 | 1559 | 1.4192 |
+
+
+ ### Framework versions
+
+ - PEFT 0.7.1
+ - Transformers 4.40.1
+ - Pytorch 2.1.2+cu121
+ - Datasets 2.19.0
+ - Tokenizers 0.19.1
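
Since the commit uploads only a LoRA adapter (PEFT 0.7.1) rather than merged weights, inference requires loading the base model and attaching the adapter on top. A minimal sketch, assuming the adapter lives at the hypothetical Hub id `Ritvik19/openhermes-phi-1_5-sft-qlora` (taken from the model-index name above):

```python
# Minimal inference sketch: base phi-1_5 + the uploaded LoRA adapter.
# The adapter repo id is an assumption based on the model-index name.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

adapter_id = "Ritvik19/openhermes-phi-1_5-sft-qlora"  # hypothetical repo id
base = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5", torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(adapter_id)
model = PeftModel.from_pretrained(base, adapter_id)  # attaches the LoRA weights

messages = [{"role": "user", "content": "Explain LoRA in one sentence."}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```

If a standalone checkpoint is preferred, `model.merge_and_unload()` folds the adapter back into the base weights.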
adapter_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "microsoft/phi-1_5",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "up_proj",
+ "gate_proj",
+ "down_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+ }
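
For reference, the same configuration expressed through the PEFT API — a sketch, not taken from the actual training code. PEFT matches `target_modules` as suffixes against module names, and several of the names here (`o_proj`, `up_proj`, `gate_proj`, `down_proj`) are llama-style and do not exist in `PhiForCausalLM` (which uses `q_proj`, `k_proj`, `v_proj`, `dense`, `fc1`, `fc2`), so they simply never match:

```python
# Sketch of adapter_config.json as a peft.LoraConfig (PEFT 0.7.x-era fields).
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,               # LoRA rank
    lora_alpha=16,      # effective scaling lora_alpha / r = 1.0
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Matched as suffixes; only q_proj/k_proj/v_proj exist in phi-1_5.
    target_modules=["v_proj", "k_proj", "o_proj", "q_proj",
                    "up_proj", "gate_proj", "down_proj"],
)
```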
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e10b68aad77bff7fd7a087790709cf46c9f1f701e63ec1134308eff7fc084fe
+ size 9456424
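
A back-of-envelope check, assuming bfloat16 storage (2 bytes per parameter) and that only the three projection names actually present in phi-1_5 were wrapped, lands close to the 9,456,424-byte file size:

```python
# Rough size check for the adapter file (assumptions noted above).
hidden_size, num_layers, rank = 2048, 24, 16  # from config.json / adapter_config.json
matched_modules = 3                           # q_proj, k_proj, v_proj

# Each wrapped 2048x2048 linear adds lora_A (r x d) and lora_B (d x r).
params = num_layers * matched_modules * 2 * rank * hidden_size
print(params)      # 4718592 trainable LoRA parameters
print(params * 2)  # 9437184 bytes; the ~19 kB remainder is the safetensors header
```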
added_tokens.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "\t\t": 50294,
+ "\t\t\t": 50293,
+ "\t\t\t\t": 50292,
+ "\t\t\t\t\t": 50291,
+ "\t\t\t\t\t\t": 50290,
+ "\t\t\t\t\t\t\t": 50289,
+ "\t\t\t\t\t\t\t\t": 50288,
+ "\t\t\t\t\t\t\t\t\t": 50287,
+ "  ": 50286,
+ "   ": 50285,
+ "    ": 50284,
+ "     ": 50283,
+ "      ": 50282,
+ "       ": 50281,
+ "        ": 50280,
+ "         ": 50279,
+ "          ": 50278,
+ "           ": 50277,
+ "            ": 50276,
+ "             ": 50275,
+ "              ": 50274,
+ "               ": 50273,
+ "                ": 50272,
+ "                 ": 50271,
+ "                  ": 50270,
+ "                   ": 50269,
+ "                    ": 50268,
+ "                     ": 50267,
+ "                      ": 50266,
+ "                       ": 50265,
+ "                        ": 50264,
+ "                         ": 50263,
+ "                          ": 50262,
+ "                           ": 50261,
+ "                            ": 50260,
+ "                             ": 50259,
+ "                              ": 50258,
+ "                               ": 50257
+ }
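
These added tokens are the CodeGen tokenizer's whitespace compression: each run of 2–9 tabs and, by the same pattern, 2–31 spaces becomes a single token, which keeps indented code short. A quick sketch of the effect (the exact ids are inferred from the listing above):

```python
# Sketch: whitespace runs encode to single added tokens (ids 50257-50294).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
print(tok.encode("        "))  # an 8-space run -> expected to be one id in 50257-50286
print(tok.encode("\t\t\t"))    # a 3-tab run   -> expected to be one id in 50287-50294
```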
all_results.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "epoch": 0.9993739889922224,
+ "eval_loss": 1.4191993474960327,
+ "eval_runtime": 1934.5604,
+ "eval_samples": 23109,
+ "eval_samples_per_second": 7.315,
+ "eval_steps_per_second": 7.315,
+ "total_flos": 3.232184148701479e+18,
+ "train_loss": 0.016414370117774753,
+ "train_runtime": 2971.8566,
+ "train_samples": 1001538,
+ "train_samples_per_second": 67.189,
+ "train_steps_per_second": 0.525
+ }
config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "_name_or_path": "microsoft/phi-1_5",
+ "architectures": [
+ "PhiForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": null,
+ "embd_pdrop": 0.0,
+ "eos_token_id": null,
+ "hidden_act": "gelu_new",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 8192,
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 2048,
+ "model_type": "phi",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 32,
+ "partial_rotary_factor": 0.5,
+ "qk_layernorm": false,
+ "resid_pdrop": 0.0,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.40.1",
+ "use_cache": true,
+ "vocab_size": 51200
+ }
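
The config fields support a rough parameter count for the base model. A sketch that ignores biases, layer norms, and rotary buffers, and counts the output head separately since `tie_word_embeddings` is false:

```python
# Rough phi-1_5 parameter count from config.json values.
vocab, d, d_ff, layers = 51200, 2048, 8192, 24

embed = vocab * d            # input embeddings
lm_head = vocab * d          # untied output head
attn = 4 * d * d             # q_proj, k_proj, v_proj, dense
mlp = 2 * d * d_ff           # fc1, fc2
total = embed + lm_head + layers * (attn + mlp)
print(f"{total / 1e9:.2f}B") # ~1.42B, the same ballpark as the advertised 1.3B
```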
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 0.9993739889922224,
+ "eval_loss": 1.4191993474960327,
+ "eval_runtime": 1934.5604,
+ "eval_samples": 23109,
+ "eval_samples_per_second": 7.315,
+ "eval_steps_per_second": 7.315
+ }
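
The cross-entropy eval loss is easier to read as perplexity, its exponential:

```python
import math
eval_loss = 1.4191993474960327  # from eval_results.json
print(math.exp(eval_loss))      # ~4.13 perplexity on the held-out set
```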
merges.txt ADDED
The diff for this file is too large to render.
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<|endoftext|>",
+ "unk_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
The diff for this file is too large to render.
tokenizer_config.json ADDED
@@ -0,0 +1,326 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "50257": {
+ "content": "                               ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50258": {
+ "content": "                              ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50259": {
+ "content": "                             ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50260": {
+ "content": "                            ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50261": {
+ "content": "                           ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50262": {
+ "content": "                          ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50263": {
+ "content": "                         ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50264": {
+ "content": "                        ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50265": {
+ "content": "                       ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50266": {
+ "content": "                      ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50267": {
+ "content": "                     ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50268": {
+ "content": "                    ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50269": {
+ "content": "                   ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50270": {
+ "content": "                  ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50271": {
+ "content": "                 ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50272": {
+ "content": "                ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50273": {
+ "content": "               ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50274": {
+ "content": "              ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50275": {
+ "content": "             ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50276": {
+ "content": "            ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50277": {
+ "content": "           ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50278": {
+ "content": "          ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50279": {
+ "content": "         ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50280": {
+ "content": "        ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50281": {
+ "content": "       ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50282": {
+ "content": "      ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50283": {
+ "content": "     ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50284": {
+ "content": "    ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50285": {
+ "content": "   ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50286": {
+ "content": "  ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50287": {
+ "content": "\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50288": {
+ "content": "\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50289": {
+ "content": "\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50290": {
+ "content": "\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50291": {
+ "content": "\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50292": {
+ "content": "\t\t\t\t",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50293": {
+ "content": "\t\t\t",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50294": {
+ "content": "\t\t",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 2048,
+ "pad_token": "<|endoftext|>",
+ "return_token_type_ids": false,
+ "tokenizer_class": "CodeGenTokenizer",
+ "unk_token": "<|endoftext|>"
+ }
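
The `chat_template` is the Zephyr-style format used by the alignment handbook: each turn is wrapped in a `<|system|>`, `<|user|>`, or `<|assistant|>` header and closed with the `<|endoftext|>` EOS token. A sketch of how a conversation renders through it, assuming the uploaded tokenizer files under the hypothetical repo id used earlier:

```python
# Render a conversation through the uploaded chat template.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Ritvik19/openhermes-phi-1_5-sft-qlora")  # hypothetical id
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is QLoRA?"},
]
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# <|system|>
# You are a helpful assistant.<|endoftext|>
# <|user|>
# What is QLoRA?<|endoftext|>
# <|assistant|>
```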
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 0.9993739889922224,
+ "total_flos": 3.232184148701479e+18,
+ "train_loss": 0.016414370117774753,
+ "train_runtime": 2971.8566,
+ "train_samples": 1001538,
+ "train_samples_per_second": 67.189,
+ "train_steps_per_second": 0.525
+ }
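
The throughput figures are internally consistent with the README's effective batch size of 128 (per-device batch 1 × 128 gradient-accumulation steps) and the 1559 optimizer steps from the training-results table; the small residual difference is presumably logging rounding:

```python
# Consistency check between train_results.json and the README hyperparameters.
steps, runtime = 1559, 2971.8566          # global steps, train_runtime in seconds
effective_batch = 1 * 128                 # train_batch_size * gradient_accumulation_steps
print(steps / runtime)                    # ~0.525, matching train_steps_per_second
print(steps * effective_batch / runtime)  # ~67.15, close to train_samples_per_second
```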
trainer_state.json ADDED
@@ -0,0 +1,2222 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9993739889922224,
5
+ "eval_steps": 500,
6
+ "global_step": 1559,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0006410352719642222,
13
+ "grad_norm": 0.10498046875,
14
+ "learning_rate": 1.282051282051282e-06,
15
+ "loss": 1.8493,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.003205176359821111,
20
+ "grad_norm": 0.1103515625,
21
+ "learning_rate": 6.41025641025641e-06,
22
+ "loss": 1.8865,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.006410352719642222,
27
+ "grad_norm": 0.1005859375,
28
+ "learning_rate": 1.282051282051282e-05,
29
+ "loss": 1.8383,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.009615529079463333,
34
+ "grad_norm": 0.10693359375,
35
+ "learning_rate": 1.923076923076923e-05,
36
+ "loss": 1.8385,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.012820705439284444,
41
+ "grad_norm": 0.1103515625,
42
+ "learning_rate": 2.564102564102564e-05,
43
+ "loss": 1.8346,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.016025881799105555,
48
+ "grad_norm": 0.1298828125,
49
+ "learning_rate": 3.205128205128206e-05,
50
+ "loss": 1.8127,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.019231058158926666,
55
+ "grad_norm": 0.1435546875,
56
+ "learning_rate": 3.846153846153846e-05,
57
+ "loss": 1.7981,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.022436234518747777,
62
+ "grad_norm": 0.1494140625,
63
+ "learning_rate": 4.4871794871794874e-05,
64
+ "loss": 1.7907,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.025641410878568888,
69
+ "grad_norm": 0.1416015625,
70
+ "learning_rate": 5.128205128205128e-05,
71
+ "loss": 1.7468,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 0.02884658723839,
76
+ "grad_norm": 0.1328125,
77
+ "learning_rate": 5.769230769230769e-05,
78
+ "loss": 1.7105,
79
+ "step": 45
80
+ },
81
+ {
82
+ "epoch": 0.03205176359821111,
83
+ "grad_norm": 0.126953125,
84
+ "learning_rate": 6.410256410256412e-05,
85
+ "loss": 1.6887,
86
+ "step": 50
87
+ },
88
+ {
89
+ "epoch": 0.035256939958032224,
90
+ "grad_norm": 0.107421875,
91
+ "learning_rate": 7.051282051282052e-05,
92
+ "loss": 1.6757,
93
+ "step": 55
94
+ },
95
+ {
96
+ "epoch": 0.03846211631785333,
97
+ "grad_norm": 0.10009765625,
98
+ "learning_rate": 7.692307692307693e-05,
99
+ "loss": 1.6736,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.041667292677674446,
104
+ "grad_norm": 0.078125,
105
+ "learning_rate": 8.333333333333334e-05,
106
+ "loss": 1.6252,
107
+ "step": 65
108
+ },
109
+ {
110
+ "epoch": 0.04487246903749555,
111
+ "grad_norm": 0.06201171875,
112
+ "learning_rate": 8.974358974358975e-05,
113
+ "loss": 1.5655,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.04807764539731667,
118
+ "grad_norm": 0.049072265625,
119
+ "learning_rate": 9.615384615384617e-05,
120
+ "loss": 1.5646,
121
+ "step": 75
122
+ },
123
+ {
124
+ "epoch": 0.051282821757137775,
125
+ "grad_norm": 0.04345703125,
126
+ "learning_rate": 0.00010256410256410256,
127
+ "loss": 1.5861,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 0.05448799811695889,
132
+ "grad_norm": 0.039794921875,
133
+ "learning_rate": 0.00010897435897435896,
134
+ "loss": 1.5379,
135
+ "step": 85
136
+ },
137
+ {
138
+ "epoch": 0.05769317447678,
139
+ "grad_norm": 0.0400390625,
140
+ "learning_rate": 0.00011538461538461538,
141
+ "loss": 1.5557,
142
+ "step": 90
143
+ },
144
+ {
145
+ "epoch": 0.06089835083660111,
146
+ "grad_norm": 0.037841796875,
147
+ "learning_rate": 0.00012179487179487179,
148
+ "loss": 1.5102,
149
+ "step": 95
150
+ },
151
+ {
152
+ "epoch": 0.06410352719642222,
153
+ "grad_norm": 0.038330078125,
154
+ "learning_rate": 0.00012820512820512823,
155
+ "loss": 1.5048,
156
+ "step": 100
157
+ },
158
+ {
159
+ "epoch": 0.06730870355624333,
160
+ "grad_norm": 0.033935546875,
161
+ "learning_rate": 0.00013461538461538464,
162
+ "loss": 1.5127,
163
+ "step": 105
164
+ },
165
+ {
166
+ "epoch": 0.07051387991606445,
167
+ "grad_norm": 0.03173828125,
168
+ "learning_rate": 0.00014102564102564104,
169
+ "loss": 1.5161,
170
+ "step": 110
171
+ },
172
+ {
173
+ "epoch": 0.07371905627588556,
174
+ "grad_norm": 0.0301513671875,
175
+ "learning_rate": 0.00014743589743589745,
176
+ "loss": 1.4948,
177
+ "step": 115
178
+ },
179
+ {
180
+ "epoch": 0.07692423263570666,
181
+ "grad_norm": 0.03125,
182
+ "learning_rate": 0.00015384615384615385,
183
+ "loss": 1.4584,
184
+ "step": 120
185
+ },
186
+ {
187
+ "epoch": 0.08012940899552777,
188
+ "grad_norm": 0.029052734375,
189
+ "learning_rate": 0.00016025641025641028,
190
+ "loss": 1.4704,
191
+ "step": 125
192
+ },
193
+ {
194
+ "epoch": 0.08333458535534889,
195
+ "grad_norm": 0.0279541015625,
196
+ "learning_rate": 0.0001666666666666667,
197
+ "loss": 1.4411,
198
+ "step": 130
199
+ },
200
+ {
201
+ "epoch": 0.08653976171517,
202
+ "grad_norm": 0.0263671875,
203
+ "learning_rate": 0.0001730769230769231,
204
+ "loss": 1.4723,
205
+ "step": 135
206
+ },
207
+ {
208
+ "epoch": 0.0897449380749911,
209
+ "grad_norm": 0.02685546875,
210
+ "learning_rate": 0.0001794871794871795,
211
+ "loss": 1.4505,
212
+ "step": 140
213
+ },
214
+ {
215
+ "epoch": 0.09295011443481223,
216
+ "grad_norm": 0.0291748046875,
217
+ "learning_rate": 0.0001858974358974359,
218
+ "loss": 1.4367,
219
+ "step": 145
220
+ },
221
+ {
222
+ "epoch": 0.09615529079463334,
223
+ "grad_norm": 0.0262451171875,
224
+ "learning_rate": 0.00019230769230769233,
225
+ "loss": 1.4291,
226
+ "step": 150
227
+ },
228
+ {
229
+ "epoch": 0.09936046715445444,
230
+ "grad_norm": 0.0390625,
231
+ "learning_rate": 0.00019871794871794874,
232
+ "loss": 1.4075,
233
+ "step": 155
234
+ },
235
+ {
236
+ "epoch": 0.10256564351427555,
237
+ "grad_norm": 0.03857421875,
238
+ "learning_rate": 0.00019999598882613538,
239
+ "loss": 1.4203,
240
+ "step": 160
241
+ },
242
+ {
243
+ "epoch": 0.10577081987409667,
244
+ "grad_norm": 0.029541015625,
245
+ "learning_rate": 0.00019997969398381457,
246
+ "loss": 1.4188,
247
+ "step": 165
248
+ },
249
+ {
250
+ "epoch": 0.10897599623391778,
251
+ "grad_norm": 0.025146484375,
252
+ "learning_rate": 0.00019995086681563726,
253
+ "loss": 1.4512,
254
+ "step": 170
255
+ },
256
+ {
257
+ "epoch": 0.11218117259373889,
258
+ "grad_norm": 0.025146484375,
259
+ "learning_rate": 0.0001999095109350519,
260
+ "loss": 1.417,
261
+ "step": 175
262
+ },
263
+ {
264
+ "epoch": 0.11538634895356,
265
+ "grad_norm": 0.02734375,
266
+ "learning_rate": 0.0001998556315259648,
267
+ "loss": 1.4309,
268
+ "step": 180
269
+ },
270
+ {
271
+ "epoch": 0.11859152531338112,
272
+ "grad_norm": 0.0255126953125,
273
+ "learning_rate": 0.00019978923534209054,
274
+ "loss": 1.4201,
275
+ "step": 185
276
+ },
277
+ {
278
+ "epoch": 0.12179670167320222,
279
+ "grad_norm": 0.0286865234375,
280
+ "learning_rate": 0.00019971033070610518,
281
+ "loss": 1.4187,
282
+ "step": 190
283
+ },
284
+ {
285
+ "epoch": 0.12500187803302334,
286
+ "grad_norm": 0.030517578125,
287
+ "learning_rate": 0.0001996189275086033,
288
+ "loss": 1.4153,
289
+ "step": 195
290
+ },
291
+ {
292
+ "epoch": 0.12820705439284444,
293
+ "grad_norm": 0.0272216796875,
294
+ "learning_rate": 0.00019951503720685784,
295
+ "loss": 1.4279,
296
+ "step": 200
297
+ },
298
+ {
299
+ "epoch": 0.13141223075266556,
300
+ "grad_norm": 0.0267333984375,
301
+ "learning_rate": 0.0001993986728233844,
302
+ "loss": 1.4052,
303
+ "step": 205
304
+ },
305
+ {
306
+ "epoch": 0.13461740711248665,
307
+ "grad_norm": 0.0264892578125,
308
+ "learning_rate": 0.0001992698489443085,
309
+ "loss": 1.3943,
310
+ "step": 210
311
+ },
312
+ {
313
+ "epoch": 0.13782258347230777,
314
+ "grad_norm": 0.032470703125,
315
+ "learning_rate": 0.0001991285817175375,
316
+ "loss": 1.3931,
317
+ "step": 215
318
+ },
319
+ {
320
+ "epoch": 0.1410277598321289,
321
+ "grad_norm": 0.0291748046875,
322
+ "learning_rate": 0.0001989748888507363,
323
+ "loss": 1.3931,
324
+ "step": 220
325
+ },
326
+ {
327
+ "epoch": 0.14423293619195,
328
+ "grad_norm": 0.03125,
329
+ "learning_rate": 0.00019880878960910772,
330
+ "loss": 1.3899,
331
+ "step": 225
332
+ },
333
+ {
334
+ "epoch": 0.1474381125517711,
335
+ "grad_norm": 0.0322265625,
336
+ "learning_rate": 0.0001986303048129778,
337
+ "loss": 1.4305,
338
+ "step": 230
339
+ },
340
+ {
341
+ "epoch": 0.15064328891159223,
342
+ "grad_norm": 0.033203125,
343
+ "learning_rate": 0.0001984394568351858,
344
+ "loss": 1.4028,
345
+ "step": 235
346
+ },
347
+ {
348
+ "epoch": 0.15384846527141333,
349
+ "grad_norm": 0.03369140625,
350
+ "learning_rate": 0.00019823626959827997,
351
+ "loss": 1.3758,
352
+ "step": 240
353
+ },
354
+ {
355
+ "epoch": 0.15705364163123445,
356
+ "grad_norm": 0.041015625,
357
+ "learning_rate": 0.0001980207685715186,
358
+ "loss": 1.407,
359
+ "step": 245
360
+ },
361
+ {
362
+ "epoch": 0.16025881799105554,
363
+ "grad_norm": 0.034912109375,
364
+ "learning_rate": 0.00019779298076767795,
365
+ "loss": 1.3923,
366
+ "step": 250
367
+ },
368
+ {
369
+ "epoch": 0.16346399435087666,
370
+ "grad_norm": 0.047119140625,
371
+ "learning_rate": 0.00019755293473966572,
372
+ "loss": 1.3967,
373
+ "step": 255
374
+ },
375
+ {
376
+ "epoch": 0.16666917071069778,
377
+ "grad_norm": 0.043701171875,
378
+ "learning_rate": 0.00019730066057694235,
379
+ "loss": 1.4007,
380
+ "step": 260
381
+ },
382
+ {
383
+ "epoch": 0.16987434707051888,
384
+ "grad_norm": 0.050048828125,
385
+ "learning_rate": 0.00019703618990174918,
386
+ "loss": 1.3978,
387
+ "step": 265
388
+ },
389
+ {
390
+ "epoch": 0.17307952343034,
391
+ "grad_norm": 0.048095703125,
392
+ "learning_rate": 0.00019675955586514468,
393
+ "loss": 1.3744,
394
+ "step": 270
395
+ },
396
+ {
397
+ "epoch": 0.17628469979016112,
398
+ "grad_norm": 0.033935546875,
399
+ "learning_rate": 0.00019647079314284897,
400
+ "loss": 1.3929,
401
+ "step": 275
402
+ },
403
+ {
404
+ "epoch": 0.1794898761499822,
405
+ "grad_norm": 0.033935546875,
406
+ "learning_rate": 0.0001961699379308974,
407
+ "loss": 1.4031,
408
+ "step": 280
409
+ },
410
+ {
411
+ "epoch": 0.18269505250980334,
412
+ "grad_norm": 0.04052734375,
413
+ "learning_rate": 0.0001958570279411032,
414
+ "loss": 1.3813,
415
+ "step": 285
416
+ },
417
+ {
418
+ "epoch": 0.18590022886962446,
419
+ "grad_norm": 0.052734375,
420
+ "learning_rate": 0.00019553210239633056,
421
+ "loss": 1.3956,
422
+ "step": 290
423
+ },
424
+ {
425
+ "epoch": 0.18910540522944555,
426
+ "grad_norm": 0.048095703125,
427
+ "learning_rate": 0.00019519520202557797,
428
+ "loss": 1.3988,
429
+ "step": 295
430
+ },
431
+ {
432
+ "epoch": 0.19231058158926667,
433
+ "grad_norm": 0.037109375,
434
+ "learning_rate": 0.00019484636905887296,
435
+ "loss": 1.3925,
436
+ "step": 300
437
+ },
438
+ {
439
+ "epoch": 0.19551575794908777,
440
+ "grad_norm": 0.036865234375,
441
+ "learning_rate": 0.00019448564722197853,
442
+ "loss": 1.376,
443
+ "step": 305
444
+ },
445
+ {
446
+ "epoch": 0.1987209343089089,
447
+ "grad_norm": 0.04052734375,
448
+ "learning_rate": 0.00019411308173091228,
449
+ "loss": 1.3974,
450
+ "step": 310
451
+ },
452
+ {
453
+ "epoch": 0.20192611066873,
454
+ "grad_norm": 0.052490234375,
455
+ "learning_rate": 0.0001937287192862787,
456
+ "loss": 1.3765,
457
+ "step": 315
458
+ },
459
+ {
460
+ "epoch": 0.2051312870285511,
461
+ "grad_norm": 0.059326171875,
462
+ "learning_rate": 0.00019333260806741502,
463
+ "loss": 1.3769,
464
+ "step": 320
465
+ },
466
+ {
467
+ "epoch": 0.20833646338837222,
468
+ "grad_norm": 0.052490234375,
469
+ "learning_rate": 0.00019292479772635237,
470
+ "loss": 1.3792,
471
+ "step": 325
472
+ },
473
+ {
474
+ "epoch": 0.21154163974819334,
475
+ "grad_norm": 0.048583984375,
476
+ "learning_rate": 0.00019250533938159166,
477
+ "loss": 1.3968,
478
+ "step": 330
479
+ },
480
+ {
481
+ "epoch": 0.21474681610801444,
482
+ "grad_norm": 0.040283203125,
483
+ "learning_rate": 0.00019207428561169608,
484
+ "loss": 1.38,
485
+ "step": 335
486
+ },
487
+ {
488
+ "epoch": 0.21795199246783556,
489
+ "grad_norm": 0.043701171875,
490
+ "learning_rate": 0.0001916316904487005,
491
+ "loss": 1.3737,
492
+ "step": 340
493
+ },
494
+ {
495
+ "epoch": 0.22115716882765665,
496
+ "grad_norm": 0.03759765625,
497
+ "learning_rate": 0.00019117760937133844,
498
+ "loss": 1.4065,
499
+ "step": 345
500
+ },
501
+ {
502
+ "epoch": 0.22436234518747777,
503
+ "grad_norm": 0.038330078125,
504
+ "learning_rate": 0.00019071209929808806,
505
+ "loss": 1.4012,
506
+ "step": 350
507
+ },
508
+ {
509
+ "epoch": 0.2275675215472989,
510
+ "grad_norm": 0.041748046875,
511
+ "learning_rate": 0.00019023521858003742,
512
+ "loss": 1.3941,
513
+ "step": 355
514
+ },
515
+ {
516
+ "epoch": 0.23077269790712,
517
+ "grad_norm": 0.037841796875,
518
+ "learning_rate": 0.00018974702699357029,
519
+ "loss": 1.4072,
520
+ "step": 360
521
+ },
522
+ {
523
+ "epoch": 0.2339778742669411,
524
+ "grad_norm": 0.03759765625,
525
+ "learning_rate": 0.00018924758573287315,
526
+ "loss": 1.3531,
527
+ "step": 365
528
+ },
529
+ {
530
+ "epoch": 0.23718305062676223,
531
+ "grad_norm": 0.03662109375,
532
+ "learning_rate": 0.00018873695740226468,
533
+ "loss": 1.3682,
534
+ "step": 370
535
+ },
536
+ {
537
+ "epoch": 0.24038822698658333,
538
+ "grad_norm": 0.047607421875,
539
+ "learning_rate": 0.0001882152060083484,
540
+ "loss": 1.3796,
541
+ "step": 375
542
+ },
543
+ {
544
+ "epoch": 0.24359340334640445,
545
+ "grad_norm": 0.041015625,
546
+ "learning_rate": 0.00018768239695198945,
547
+ "loss": 1.3835,
548
+ "step": 380
549
+ },
550
+ {
551
+ "epoch": 0.24679857970622554,
552
+ "grad_norm": 0.04541015625,
553
+ "learning_rate": 0.0001871385970201168,
554
+ "loss": 1.3678,
555
+ "step": 385
556
+ },
557
+ {
558
+ "epoch": 0.2500037560660467,
559
+ "grad_norm": 0.04345703125,
560
+ "learning_rate": 0.00018658387437735135,
561
+ "loss": 1.3778,
562
+ "step": 390
563
+ },
564
+ {
565
+ "epoch": 0.2532089324258678,
566
+ "grad_norm": 0.06396484375,
567
+ "learning_rate": 0.00018601829855746185,
568
+ "loss": 1.3811,
569
+ "step": 395
570
+ },
571
+ {
572
+ "epoch": 0.2564141087856889,
573
+ "grad_norm": 0.057373046875,
574
+ "learning_rate": 0.00018544194045464886,
575
+ "loss": 1.3851,
576
+ "step": 400
577
+ },
578
+ {
579
+ "epoch": 0.25961928514551,
580
+ "grad_norm": 0.0458984375,
581
+ "learning_rate": 0.0001848548723146581,
582
+ "loss": 1.3865,
583
+ "step": 405
584
+ },
585
+ {
586
+ "epoch": 0.2628244615053311,
587
+ "grad_norm": 0.047119140625,
588
+ "learning_rate": 0.00018425716772572473,
589
+ "loss": 1.3638,
590
+ "step": 410
591
+ },
592
+ {
593
+ "epoch": 0.2660296378651522,
594
+ "grad_norm": 0.04443359375,
595
+ "learning_rate": 0.00018364890160934904,
596
+ "loss": 1.3918,
597
+ "step": 415
598
+ },
599
+ {
600
+ "epoch": 0.2692348142249733,
601
+ "grad_norm": 0.042236328125,
602
+ "learning_rate": 0.00018303015021090525,
603
+ "loss": 1.3794,
604
+ "step": 420
605
+ },
606
+ {
607
+ "epoch": 0.27243999058479446,
608
+ "grad_norm": 0.06005859375,
609
+ "learning_rate": 0.00018240099109008412,
610
+ "loss": 1.3836,
611
+ "step": 425
612
+ },
613
+ {
614
+ "epoch": 0.27564516694461555,
615
+ "grad_norm": 0.05419921875,
616
+ "learning_rate": 0.000181761503111171,
617
+ "loss": 1.3676,
618
+ "step": 430
619
+ },
620
+ {
621
+ "epoch": 0.27885034330443664,
622
+ "grad_norm": 0.04443359375,
623
+ "learning_rate": 0.0001811117664331604,
624
+ "loss": 1.3513,
625
+ "step": 435
626
+ },
627
+ {
628
+ "epoch": 0.2820555196642578,
629
+ "grad_norm": 0.047607421875,
630
+ "learning_rate": 0.00018045186249970784,
631
+ "loss": 1.3602,
632
+ "step": 440
633
+ },
634
+ {
635
+ "epoch": 0.2852606960240789,
636
+ "grad_norm": 0.043212890625,
637
+ "learning_rate": 0.00017978187402892148,
638
+ "loss": 1.3468,
639
+ "step": 445
640
+ },
641
+ {
642
+ "epoch": 0.2884658723839,
643
+ "grad_norm": 0.05078125,
644
+ "learning_rate": 0.00017910188500299304,
645
+ "loss": 1.3651,
646
+ "step": 450
647
+ },
648
+ {
649
+ "epoch": 0.29167104874372113,
650
+ "grad_norm": 0.04296875,
651
+ "learning_rate": 0.00017841198065767107,
652
+ "loss": 1.3763,
653
+ "step": 455
654
+ },
655
+ {
656
+ "epoch": 0.2948762251035422,
657
+ "grad_norm": 0.044921875,
658
+ "learning_rate": 0.00017771224747157652,
659
+ "loss": 1.3597,
660
+ "step": 460
661
+ },
662
+ {
663
+ "epoch": 0.2980814014633633,
664
+ "grad_norm": 0.0654296875,
665
+ "learning_rate": 0.00017700277315536305,
666
+ "loss": 1.3558,
667
+ "step": 465
668
+ },
669
+ {
670
+ "epoch": 0.30128657782318446,
671
+ "grad_norm": 0.052978515625,
672
+ "learning_rate": 0.00017628364664072218,
673
+ "loss": 1.3534,
674
+ "step": 470
675
+ },
676
+ {
677
+ "epoch": 0.30449175418300556,
678
+ "grad_norm": 0.04248046875,
679
+ "learning_rate": 0.00017555495806923635,
680
+ "loss": 1.3525,
681
+ "step": 475
682
+ },
683
+ {
684
+ "epoch": 0.30769693054282665,
685
+ "grad_norm": 0.044189453125,
686
+ "learning_rate": 0.00017481679878107926,
687
+ "loss": 1.3715,
688
+ "step": 480
689
+ },
690
+ {
691
+ "epoch": 0.3109021069026478,
692
+ "grad_norm": 0.058837890625,
693
+ "learning_rate": 0.00017406926130356692,
694
+ "loss": 1.3689,
695
+ "step": 485
696
+ },
697
+ {
698
+ "epoch": 0.3141072832624689,
699
+ "grad_norm": 0.095703125,
700
+ "learning_rate": 0.00017331243933955918,
701
+ "loss": 1.3686,
702
+ "step": 490
703
+ },
704
+ {
705
+ "epoch": 0.31731245962229,
706
+ "grad_norm": 0.059326171875,
707
+ "learning_rate": 0.00017254642775571438,
708
+ "loss": 1.3784,
709
+ "step": 495
710
+ },
711
+ {
712
+ "epoch": 0.3205176359821111,
713
+ "grad_norm": 0.07373046875,
714
+ "learning_rate": 0.00017177132257059787,
715
+ "loss": 1.3488,
716
+ "step": 500
717
+ },
718
+ {
719
+ "epoch": 0.32372281234193223,
720
+ "grad_norm": 0.0439453125,
721
+ "learning_rate": 0.00017098722094264617,
722
+ "loss": 1.3789,
723
+ "step": 505
724
+ },
725
+ {
726
+ "epoch": 0.3269279887017533,
727
+ "grad_norm": 0.052490234375,
728
+ "learning_rate": 0.00017019422115798833,
729
+ "loss": 1.3414,
730
+ "step": 510
731
+ },
732
+ {
733
+ "epoch": 0.3301331650615744,
734
+ "grad_norm": 0.0458984375,
735
+ "learning_rate": 0.0001693924226181259,
736
+ "loss": 1.3667,
737
+ "step": 515
738
+ },
739
+ {
740
+ "epoch": 0.33333834142139557,
741
+ "grad_norm": 0.05322265625,
742
+ "learning_rate": 0.00016858192582747304,
743
+ "loss": 1.3749,
744
+ "step": 520
745
+ },
746
+ {
747
+ "epoch": 0.33654351778121666,
748
+ "grad_norm": 0.0634765625,
749
+ "learning_rate": 0.00016776283238075851,
750
+ "loss": 1.3929,
751
+ "step": 525
752
+ },
753
+ {
754
+ "epoch": 0.33974869414103775,
755
+ "grad_norm": 0.050537109375,
756
+ "learning_rate": 0.00016693524495029068,
757
+ "loss": 1.3527,
758
+ "step": 530
759
+ },
760
+ {
761
+ "epoch": 0.3429538705008589,
762
+ "grad_norm": 0.059814453125,
763
+ "learning_rate": 0.00016609926727308806,
764
+ "loss": 1.3577,
765
+ "step": 535
766
+ },
767
+ {
768
+ "epoch": 0.34615904686068,
769
+ "grad_norm": 0.07861328125,
770
+ "learning_rate": 0.00016525500413787554,
771
+ "loss": 1.3639,
772
+ "step": 540
773
+ },
774
+ {
775
+ "epoch": 0.3493642232205011,
776
+ "grad_norm": 0.0595703125,
777
+ "learning_rate": 0.00016440256137194965,
778
+ "loss": 1.3608,
779
+ "step": 545
780
+ },
781
+ {
782
+ "epoch": 0.35256939958032224,
783
+ "grad_norm": 0.052978515625,
784
+ "learning_rate": 0.0001635420458279131,
785
+ "loss": 1.3324,
786
+ "step": 550
787
+ },
788
+ {
789
+ "epoch": 0.35577457594014333,
790
+ "grad_norm": 0.062255859375,
791
+ "learning_rate": 0.0001626735653702809,
792
+ "loss": 1.3283,
793
+ "step": 555
794
+ },
795
+ {
796
+ "epoch": 0.3589797522999644,
797
+ "grad_norm": 0.04931640625,
798
+ "learning_rate": 0.00016179722886195967,
799
+ "loss": 1.3287,
800
+ "step": 560
801
+ },
802
+ {
803
+ "epoch": 0.3621849286597856,
804
+ "grad_norm": 0.0703125,
805
+ "learning_rate": 0.00016091314615060195,
806
+ "loss": 1.3799,
807
+ "step": 565
808
+ },
809
+ {
810
+ "epoch": 0.36539010501960667,
811
+ "grad_norm": 0.051025390625,
812
+ "learning_rate": 0.00016002142805483685,
813
+ "loss": 1.3399,
814
+ "step": 570
815
+ },
816
+ {
817
+ "epoch": 0.36859528137942776,
818
+ "grad_norm": 0.05908203125,
819
+ "learning_rate": 0.00015912218635037896,
820
+ "loss": 1.3698,
821
+ "step": 575
822
+ },
823
+ {
824
+ "epoch": 0.3718004577392489,
825
+ "grad_norm": 0.05078125,
826
+ "learning_rate": 0.0001582155337560177,
827
+ "loss": 1.3378,
828
+ "step": 580
829
+ },
830
+ {
831
+ "epoch": 0.37500563409907,
832
+ "grad_norm": 0.051025390625,
833
+ "learning_rate": 0.00015730158391948784,
834
+ "loss": 1.337,
835
+ "step": 585
836
+ },
837
+ {
838
+ "epoch": 0.3782108104588911,
839
+ "grad_norm": 0.0498046875,
840
+ "learning_rate": 0.0001563804514032242,
841
+ "loss": 1.3527,
842
+ "step": 590
843
+ },
844
+ {
845
+ "epoch": 0.3814159868187122,
846
+ "grad_norm": 0.052734375,
847
+ "learning_rate": 0.0001554522516700011,
848
+ "loss": 1.3583,
849
+ "step": 595
850
+ },
851
+ {
852
+ "epoch": 0.38462116317853334,
853
+ "grad_norm": 0.06201171875,
854
+ "learning_rate": 0.00015451710106845955,
855
+ "loss": 1.3421,
856
+ "step": 600
857
+ },
858
+ {
859
+ "epoch": 0.38782633953835444,
860
+ "grad_norm": 0.050537109375,
861
+ "learning_rate": 0.0001535751168185228,
862
+ "loss": 1.3577,
863
+ "step": 605
864
+ },
865
+ {
866
+ "epoch": 0.39103151589817553,
867
+ "grad_norm": 0.05517578125,
868
+ "learning_rate": 0.00015262641699670328,
869
+ "loss": 1.3706,
870
+ "step": 610
871
+ },
872
+ {
873
+ "epoch": 0.3942366922579967,
874
+ "grad_norm": 0.054931640625,
875
+ "learning_rate": 0.0001516711205213016,
876
+ "loss": 1.3439,
877
+ "step": 615
878
+ },
879
+ {
880
+ "epoch": 0.3974418686178178,
881
+ "grad_norm": 0.0478515625,
882
+ "learning_rate": 0.00015070934713750042,
883
+ "loss": 1.3353,
884
+ "step": 620
885
+ },
886
+ {
887
+ "epoch": 0.40064704497763887,
888
+ "grad_norm": 0.048583984375,
889
+ "learning_rate": 0.00014974121740235456,
890
+ "loss": 1.3489,
891
+ "step": 625
892
+ },
893
+ {
894
+ "epoch": 0.40385222133746,
895
+ "grad_norm": 0.057373046875,
896
+ "learning_rate": 0.00014876685266967924,
897
+ "loss": 1.3481,
898
+ "step": 630
899
+ },
900
+ {
901
+ "epoch": 0.4070573976972811,
902
+ "grad_norm": 0.053466796875,
903
+ "learning_rate": 0.00014778637507483866,
904
+ "loss": 1.3533,
905
+ "step": 635
906
+ },
907
+ {
908
+ "epoch": 0.4102625740571022,
909
+ "grad_norm": 0.06494140625,
910
+ "learning_rate": 0.0001467999075194363,
911
+ "loss": 1.3522,
912
+ "step": 640
913
+ },
914
+ {
915
+ "epoch": 0.41346775041692335,
916
+ "grad_norm": 0.06689453125,
917
+ "learning_rate": 0.00014580757365590963,
918
+ "loss": 1.3712,
919
+ "step": 645
920
+ },
921
+ {
922
+ "epoch": 0.41667292677674445,
923
+ "grad_norm": 0.053955078125,
924
+ "learning_rate": 0.00014480949787203014,
925
+ "loss": 1.3606,
926
+ "step": 650
927
+ },
928
+ {
929
+ "epoch": 0.41987810313656554,
930
+ "grad_norm": 0.046875,
931
+ "learning_rate": 0.0001438058052753118,
932
+ "loss": 1.3488,
933
+ "step": 655
934
+ },
935
+ {
936
+ "epoch": 0.4230832794963867,
937
+ "grad_norm": 0.058837890625,
938
+ "learning_rate": 0.00014279662167732867,
939
+ "loss": 1.342,
940
+ "step": 660
941
+ },
942
+ {
943
+ "epoch": 0.4262884558562078,
944
+ "grad_norm": 0.07080078125,
945
+ "learning_rate": 0.00014178207357794486,
946
+ "loss": 1.3712,
947
+ "step": 665
948
+ },
949
+ {
950
+ "epoch": 0.4294936322160289,
951
+ "grad_norm": 0.05029296875,
952
+ "learning_rate": 0.00014076228814945778,
953
+ "loss": 1.3227,
954
+ "step": 670
955
+ },
956
+ {
957
+ "epoch": 0.43269880857585,
958
+ "grad_norm": 0.06982421875,
959
+ "learning_rate": 0.00013973739322065728,
960
+ "loss": 1.3201,
961
+ "step": 675
962
+ },
963
+ {
964
+ "epoch": 0.4359039849356711,
965
+ "grad_norm": 0.05029296875,
966
+ "learning_rate": 0.00013870751726080256,
967
+ "loss": 1.3406,
968
+ "step": 680
969
+ },
970
+ {
971
+ "epoch": 0.4391091612954922,
972
+ "grad_norm": 0.06201171875,
973
+ "learning_rate": 0.00013767278936351854,
974
+ "loss": 1.3636,
975
+ "step": 685
976
+ },
977
+ {
978
+ "epoch": 0.4423143376553133,
979
+ "grad_norm": 0.0458984375,
980
+ "learning_rate": 0.0001366333392306143,
981
+ "loss": 1.3576,
982
+ "step": 690
983
+ },
984
+ {
985
+ "epoch": 0.44551951401513445,
986
+ "grad_norm": 0.06005859375,
987
+ "learning_rate": 0.00013558929715582515,
988
+ "loss": 1.3517,
989
+ "step": 695
990
+ },
991
+ {
992
+ "epoch": 0.44872469037495555,
993
+ "grad_norm": 0.05126953125,
994
+ "learning_rate": 0.00013454079400848027,
995
+ "loss": 1.3376,
996
+ "step": 700
997
+ },
998
+ {
999
+ "epoch": 0.45192986673477664,
1000
+ "grad_norm": 0.059326171875,
1001
+ "learning_rate": 0.00013348796121709862,
1002
+ "loss": 1.3633,
1003
+ "step": 705
1004
+ },
1005
+ {
1006
+ "epoch": 0.4551350430945978,
1007
+ "grad_norm": 0.05078125,
1008
+ "learning_rate": 0.00013243093075291444,
1009
+ "loss": 1.3217,
1010
+ "step": 710
1011
+ },
1012
+ {
1013
+ "epoch": 0.4583402194544189,
1014
+ "grad_norm": 0.056884765625,
1015
+ "learning_rate": 0.00013136983511333482,
1016
+ "loss": 1.3265,
1017
+ "step": 715
1018
+ },
1019
+ {
1020
+ "epoch": 0.46154539581424,
1021
+ "grad_norm": 0.05859375,
1022
+ "learning_rate": 0.00013030480730533145,
1023
+ "loss": 1.3451,
1024
+ "step": 720
1025
+ },
1026
+ {
1027
+ "epoch": 0.4647505721740611,
1028
+ "grad_norm": 0.054443359375,
1029
+ "learning_rate": 0.00012923598082876812,
1030
+ "loss": 1.376,
1031
+ "step": 725
1032
+ },
1033
+ {
1034
+ "epoch": 0.4679557485338822,
1035
+ "grad_norm": 0.058349609375,
1036
+ "learning_rate": 0.0001281634896596669,
1037
+ "loss": 1.3524,
1038
+ "step": 730
1039
+ },
1040
+ {
1041
+ "epoch": 0.4711609248937033,
1042
+ "grad_norm": 0.0634765625,
1043
+ "learning_rate": 0.00012708746823341446,
1044
+ "loss": 1.3599,
1045
+ "step": 735
1046
+ },
1047
+ {
1048
+ "epoch": 0.47436610125352446,
1049
+ "grad_norm": 0.053466796875,
1050
+ "learning_rate": 0.00012600805142791042,
1051
+ "loss": 1.3416,
1052
+ "step": 740
1053
+ },
1054
+ {
1055
+ "epoch": 0.47757127761334556,
1056
+ "grad_norm": 0.055419921875,
1057
+ "learning_rate": 0.000124925374546661,
1058
+ "loss": 1.3574,
1059
+ "step": 745
1060
+ },
1061
+ {
1062
+ "epoch": 0.48077645397316665,
1063
+ "grad_norm": 0.052978515625,
1064
+ "learning_rate": 0.0001238395733018187,
1065
+ "loss": 1.3574,
1066
+ "step": 750
1067
+ },
1068
+ {
1069
+ "epoch": 0.4839816303329878,
1070
+ "grad_norm": 0.053466796875,
1071
+ "learning_rate": 0.00012275078379717089,
1072
+ "loss": 1.3341,
1073
+ "step": 755
1074
+ },
1075
+ {
1076
+ "epoch": 0.4871868066928089,
1077
+ "grad_norm": 0.0556640625,
1078
+ "learning_rate": 0.00012165914251107952,
1079
+ "loss": 1.3241,
1080
+ "step": 760
1081
+ },
1082
+ {
1083
+ "epoch": 0.49039198305263,
1084
+ "grad_norm": 0.054443359375,
1085
+ "learning_rate": 0.00012056478627937365,
1086
+ "loss": 1.3788,
1087
+ "step": 765
1088
+ },
1089
+ {
1090
+ "epoch": 0.4935971594124511,
1091
+ "grad_norm": 0.049560546875,
1092
+ "learning_rate": 0.00011946785227819726,
1093
+ "loss": 1.3581,
1094
+ "step": 770
1095
+ },
1096
+ {
1097
+ "epoch": 0.49680233577227223,
1098
+ "grad_norm": 0.05615234375,
1099
+ "learning_rate": 0.00011836847800681443,
1100
+ "loss": 1.3328,
1101
+ "step": 775
1102
+ },
1103
+ {
1104
+ "epoch": 0.5000075121320934,
1105
+ "grad_norm": 0.0556640625,
1106
+ "learning_rate": 0.00011726680127037401,
1107
+ "loss": 1.3533,
1108
+ "step": 780
1109
+ },
1110
+ {
1111
+ "epoch": 0.5032126884919145,
1112
+ "grad_norm": 0.05419921875,
1113
+ "learning_rate": 0.00011616296016263582,
1114
+ "loss": 1.3622,
1115
+ "step": 785
1116
+ },
1117
+ {
1118
+ "epoch": 0.5064178648517356,
1119
+ "grad_norm": 0.049072265625,
1120
+ "learning_rate": 0.00011505709304866084,
1121
+ "loss": 1.3446,
1122
+ "step": 790
1123
+ },
1124
+ {
1125
+ "epoch": 0.5096230412115567,
1126
+ "grad_norm": 0.0712890625,
1127
+ "learning_rate": 0.00011394933854746733,
1128
+ "loss": 1.3384,
1129
+ "step": 795
1130
+ },
1131
+ {
1132
+ "epoch": 0.5128282175713778,
1133
+ "grad_norm": 0.055908203125,
1134
+ "learning_rate": 0.00011283983551465511,
1135
+ "loss": 1.3378,
1136
+ "step": 800
1137
+ },
1138
+ {
1139
+ "epoch": 0.5160333939311988,
1140
+ "grad_norm": 0.060791015625,
1141
+ "learning_rate": 0.00011172872302500017,
1142
+ "loss": 1.3656,
1143
+ "step": 805
1144
+ },
1145
+ {
1146
+ "epoch": 0.51923857029102,
1147
+ "grad_norm": 0.0791015625,
1148
+ "learning_rate": 0.00011061614035502193,
1149
+ "loss": 1.3521,
1150
+ "step": 810
1151
+ },
1152
+ {
1153
+ "epoch": 0.5224437466508411,
1154
+ "grad_norm": 0.05859375,
1155
+ "learning_rate": 0.00010950222696552486,
1156
+ "loss": 1.3614,
1157
+ "step": 815
1158
+ },
1159
+ {
1160
+ "epoch": 0.5256489230106622,
1161
+ "grad_norm": 0.08203125,
1162
+ "learning_rate": 0.00010838712248411753,
1163
+ "loss": 1.3314,
1164
+ "step": 820
1165
+ },
1166
+ {
1167
+ "epoch": 0.5288540993704833,
1168
+ "grad_norm": 0.05322265625,
1169
+ "learning_rate": 0.00010727096668771036,
1170
+ "loss": 1.338,
1171
+ "step": 825
1172
+ },
1173
+ {
1174
+ "epoch": 0.5320592757303044,
1175
+ "grad_norm": 0.0556640625,
1176
+ "learning_rate": 0.0001061538994849946,
1177
+ "loss": 1.3611,
1178
+ "step": 830
1179
+ },
1180
+ {
1181
+ "epoch": 0.5352644520901255,
1182
+ "grad_norm": 0.06201171875,
1183
+ "learning_rate": 0.00010503606089890529,
1184
+ "loss": 1.3175,
1185
+ "step": 835
1186
+ },
1187
+ {
1188
+ "epoch": 0.5384696284499466,
1189
+ "grad_norm": 0.05712890625,
1190
+ "learning_rate": 0.00010391759104906928,
1191
+ "loss": 1.3525,
1192
+ "step": 840
1193
+ },
1194
+ {
1195
+ "epoch": 0.5416748048097678,
1196
+ "grad_norm": 0.0498046875,
1197
+ "learning_rate": 0.00010279863013424154,
1198
+ "loss": 1.3313,
1199
+ "step": 845
1200
+ },
1201
+ {
1202
+ "epoch": 0.5448799811695889,
1203
+ "grad_norm": 0.051025390625,
1204
+ "learning_rate": 0.00010167931841473142,
1205
+ "loss": 1.3349,
1206
+ "step": 850
1207
+ },
1208
+ {
1209
+ "epoch": 0.54808515752941,
1210
+ "grad_norm": 0.06298828125,
1211
+ "learning_rate": 0.00010055979619482112,
1212
+ "loss": 1.3408,
1213
+ "step": 855
1214
+ },
1215
+ {
1216
+ "epoch": 0.5512903338892311,
1217
+ "grad_norm": 0.058837890625,
1218
+ "learning_rate": 9.944020380517889e-05,
1219
+ "loss": 1.3175,
1220
+ "step": 860
1221
+ },
1222
+ {
1223
+ "epoch": 0.5544955102490522,
1224
+ "grad_norm": 0.050048828125,
1225
+ "learning_rate": 9.832068158526862e-05,
1226
+ "loss": 1.3375,
1227
+ "step": 865
1228
+ },
1229
+ {
1230
+ "epoch": 0.5577006866088733,
1231
+ "grad_norm": 0.0498046875,
1232
+ "learning_rate": 9.720136986575848e-05,
1233
+ "loss": 1.3475,
1234
+ "step": 870
1235
+ },
1236
+ {
1237
+ "epoch": 0.5609058629686945,
1238
+ "grad_norm": 0.051513671875,
1239
+ "learning_rate": 9.608240895093076e-05,
1240
+ "loss": 1.3295,
1241
+ "step": 875
1242
+ },
1243
+ {
1244
+ "epoch": 0.5641110393285156,
1245
+ "grad_norm": 0.046142578125,
1246
+ "learning_rate": 9.496393910109472e-05,
1247
+ "loss": 1.3429,
1248
+ "step": 880
1249
+ },
1250
+ {
1251
+ "epoch": 0.5673162156883367,
1252
+ "grad_norm": 0.04443359375,
1253
+ "learning_rate": 9.384610051500545e-05,
1254
+ "loss": 1.3293,
1255
+ "step": 885
1256
+ },
1257
+ {
1258
+ "epoch": 0.5705213920481578,
1259
+ "grad_norm": 0.052734375,
1260
+ "learning_rate": 9.272903331228968e-05,
1261
+ "loss": 1.3498,
1262
+ "step": 890
1263
+ },
1264
+ {
1265
+ "epoch": 0.5737265684079789,
1266
+ "grad_norm": 0.062255859375,
1267
+ "learning_rate": 9.161287751588248e-05,
1268
+ "loss": 1.3351,
1269
+ "step": 895
1270
+ },
1271
+ {
1272
+ "epoch": 0.5769317447678,
1273
+ "grad_norm": 0.064453125,
1274
+ "learning_rate": 9.049777303447516e-05,
1275
+ "loss": 1.353,
1276
+ "step": 900
1277
+ },
1278
+ {
1279
+ "epoch": 0.5801369211276212,
1280
+ "grad_norm": 0.0556640625,
1281
+ "learning_rate": 8.938385964497808e-05,
1282
+ "loss": 1.3363,
1283
+ "step": 905
1284
+ },
1285
+ {
1286
+ "epoch": 0.5833420974874423,
1287
+ "grad_norm": 0.06201171875,
1288
+ "learning_rate": 8.827127697499984e-05,
1289
+ "loss": 1.3696,
1290
+ "step": 910
1291
+ },
1292
+ {
1293
+ "epoch": 0.5865472738472634,
1294
+ "grad_norm": 0.080078125,
1295
+ "learning_rate": 8.71601644853449e-05,
1296
+ "loss": 1.3481,
1297
+ "step": 915
1298
+ },
1299
+ {
1300
+ "epoch": 0.5897524502070844,
1301
+ "grad_norm": 0.06884765625,
1302
+ "learning_rate": 8.605066145253268e-05,
1303
+ "loss": 1.3256,
1304
+ "step": 920
1305
+ },
1306
+ {
1307
+ "epoch": 0.5929576265669055,
1308
+ "grad_norm": 0.051513671875,
1309
+ "learning_rate": 8.494290695133917e-05,
1310
+ "loss": 1.3544,
1311
+ "step": 925
1312
+ },
1313
+ {
1314
+ "epoch": 0.5961628029267266,
1315
+ "grad_norm": 0.05810546875,
1316
+ "learning_rate": 8.383703983736419e-05,
1317
+ "loss": 1.3443,
1318
+ "step": 930
1319
+ },
1320
+ {
1321
+ "epoch": 0.5993679792865477,
1322
+ "grad_norm": 0.06103515625,
1323
+ "learning_rate": 8.2733198729626e-05,
1324
+ "loss": 1.3816,
1325
+ "step": 935
1326
+ },
1327
+ {
1328
+ "epoch": 0.6025731556463689,
1329
+ "grad_norm": 0.046142578125,
1330
+ "learning_rate": 8.163152199318558e-05,
1331
+ "loss": 1.3247,
1332
+ "step": 940
1333
+ },
1334
+ {
1335
+ "epoch": 0.60577833200619,
1336
+ "grad_norm": 0.053466796875,
1337
+ "learning_rate": 8.053214772180277e-05,
1338
+ "loss": 1.3532,
1339
+ "step": 945
1340
+ },
1341
+ {
1342
+ "epoch": 0.6089835083660111,
1343
+ "grad_norm": 0.05419921875,
1344
+ "learning_rate": 7.94352137206264e-05,
1345
+ "loss": 1.3443,
1346
+ "step": 950
1347
+ },
1348
+ {
1349
+ "epoch": 0.6121886847258322,
1350
+ "grad_norm": 0.047119140625,
1351
+ "learning_rate": 7.83408574889205e-05,
1352
+ "loss": 1.3327,
1353
+ "step": 955
1354
+ },
1355
+ {
1356
+ "epoch": 0.6153938610856533,
1357
+ "grad_norm": 0.0537109375,
1358
+ "learning_rate": 7.724921620282916e-05,
1359
+ "loss": 1.334,
1360
+ "step": 960
1361
+ },
1362
+ {
1363
+ "epoch": 0.6185990374454744,
1364
+ "grad_norm": 0.0703125,
1365
+ "learning_rate": 7.616042669818133e-05,
1366
+ "loss": 1.3572,
1367
+ "step": 965
1368
+ },
1369
+ {
1370
+ "epoch": 0.6218042138052956,
1371
+ "grad_norm": 0.055419921875,
1372
+ "learning_rate": 7.507462545333903e-05,
1373
+ "loss": 1.3322,
1374
+ "step": 970
1375
+ },
1376
+ {
1377
+ "epoch": 0.6250093901651167,
1378
+ "grad_norm": 0.07958984375,
1379
+ "learning_rate": 7.399194857208961e-05,
1380
+ "loss": 1.3222,
1381
+ "step": 975
1382
+ },
1383
+ {
1384
+ "epoch": 0.6282145665249378,
1385
+ "grad_norm": 0.05078125,
1386
+ "learning_rate": 7.291253176658561e-05,
1387
+ "loss": 1.3375,
1388
+ "step": 980
1389
+ },
1390
+ {
1391
+ "epoch": 0.6314197428847589,
1392
+ "grad_norm": 0.08251953125,
1393
+ "learning_rate": 7.183651034033313e-05,
1394
+ "loss": 1.3397,
1395
+ "step": 985
1396
+ },
1397
+ {
1398
+ "epoch": 0.63462491924458,
1399
+ "grad_norm": 0.04931640625,
1400
+ "learning_rate": 7.07640191712319e-05,
1401
+ "loss": 1.34,
1402
+ "step": 990
1403
+ },
1404
+ {
1405
+ "epoch": 0.6378300956044011,
1406
+ "grad_norm": 0.049072265625,
1407
+ "learning_rate": 6.969519269466857e-05,
1408
+ "loss": 1.3344,
1409
+ "step": 995
1410
+ },
1411
+ {
1412
+ "epoch": 0.6410352719642222,
1413
+ "grad_norm": 0.052490234375,
1414
+ "learning_rate": 6.863016488666517e-05,
1415
+ "loss": 1.3475,
1416
+ "step": 1000
1417
+ },
1418
+ {
1419
+ "epoch": 0.6442404483240434,
1420
+ "grad_norm": 0.04736328125,
1421
+ "learning_rate": 6.756906924708558e-05,
1422
+ "loss": 1.3317,
1423
+ "step": 1005
1424
+ },
1425
+ {
1426
+ "epoch": 0.6474456246838645,
1427
+ "grad_norm": 0.050537109375,
1428
+ "learning_rate": 6.651203878290139e-05,
1429
+ "loss": 1.3243,
1430
+ "step": 1010
1431
+ },
1432
+ {
1433
+ "epoch": 0.6506508010436856,
1434
+ "grad_norm": 0.053955078125,
1435
+ "learning_rate": 6.545920599151975e-05,
1436
+ "loss": 1.3351,
1437
+ "step": 1015
1438
+ },
1439
+ {
1440
+ "epoch": 0.6538559774035066,
1441
+ "grad_norm": 0.058837890625,
1442
+ "learning_rate": 6.441070284417487e-05,
1443
+ "loss": 1.3536,
1444
+ "step": 1020
1445
+ },
1446
+ {
1447
+ "epoch": 0.6570611537633277,
1448
+ "grad_norm": 0.060791015625,
1449
+ "learning_rate": 6.336666076938572e-05,
1450
+ "loss": 1.3064,
1451
+ "step": 1025
1452
+ },
1453
+ {
1454
+ "epoch": 0.6602663301231488,
1455
+ "grad_norm": 0.056396484375,
1456
+ "learning_rate": 6.232721063648148e-05,
1457
+ "loss": 1.3496,
1458
+ "step": 1030
1459
+ },
1460
+ {
1461
+ "epoch": 0.66347150648297,
1462
+ "grad_norm": 0.0478515625,
1463
+ "learning_rate": 6.12924827391975e-05,
1464
+ "loss": 1.3487,
1465
+ "step": 1035
1466
+ },
1467
+ {
1468
+ "epoch": 0.6666766828427911,
1469
+ "grad_norm": 0.05126953125,
1470
+ "learning_rate": 6.026260677934272e-05,
1471
+ "loss": 1.3241,
1472
+ "step": 1040
1473
+ },
1474
+ {
1475
+ "epoch": 0.6698818592026122,
1476
+ "grad_norm": 0.0478515625,
1477
+ "learning_rate": 5.9237711850542246e-05,
1478
+ "loss": 1.3454,
1479
+ "step": 1045
1480
+ },
1481
+ {
1482
+ "epoch": 0.6730870355624333,
1483
+ "grad_norm": 0.046142578125,
1484
+ "learning_rate": 5.8217926422055126e-05,
1485
+ "loss": 1.3364,
1486
+ "step": 1050
1487
+ },
1488
+ {
+ "epoch": 0.6762922119222544,
+ "grad_norm": 0.054443359375,
+ "learning_rate": 5.7203378322671355e-05,
+ "loss": 1.3152,
+ "step": 1055
+ },
+ {
+ "epoch": 0.6794973882820755,
+ "grad_norm": 0.0546875,
+ "learning_rate": 5.619419472468823e-05,
+ "loss": 1.3486,
+ "step": 1060
+ },
+ {
+ "epoch": 0.6827025646418967,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 5.519050212796986e-05,
+ "loss": 1.3301,
+ "step": 1065
+ },
+ {
+ "epoch": 0.6859077410017178,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 5.419242634409039e-05,
+ "loss": 1.3279,
+ "step": 1070
+ },
+ {
+ "epoch": 0.6891129173615389,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 5.32000924805637e-05,
+ "loss": 1.3415,
+ "step": 1075
+ },
+ {
+ "epoch": 0.69231809372136,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 5.2213624925161386e-05,
+ "loss": 1.3449,
+ "step": 1080
+ },
+ {
+ "epoch": 0.6955232700811811,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 5.123314733032074e-05,
+ "loss": 1.3442,
+ "step": 1085
+ },
+ {
+ "epoch": 0.6987284464410022,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 5.0258782597645446e-05,
+ "loss": 1.3309,
+ "step": 1090
+ },
+ {
+ "epoch": 0.7019336228008233,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 4.929065286249959e-05,
+ "loss": 1.3564,
+ "step": 1095
+ },
+ {
+ "epoch": 0.7051387991606445,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 4.832887947869841e-05,
+ "loss": 1.3578,
+ "step": 1100
+ },
1558
+ {
+ "epoch": 0.7083439755204656,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 4.737358300329673e-05,
+ "loss": 1.3417,
+ "step": 1105
+ },
+ {
+ "epoch": 0.7115491518802867,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 4.642488318147723e-05,
+ "loss": 1.3259,
+ "step": 1110
+ },
+ {
+ "epoch": 0.7147543282401078,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 4.548289893154051e-05,
+ "loss": 1.3568,
+ "step": 1115
+ },
+ {
+ "epoch": 0.7179595045999289,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 4.4547748329998925e-05,
+ "loss": 1.3211,
+ "step": 1120
+ },
+ {
+ "epoch": 0.72116468095975,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 4.361954859677584e-05,
+ "loss": 1.3398,
+ "step": 1125
+ },
+ {
+ "epoch": 0.7243698573195712,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 4.2698416080512204e-05,
+ "loss": 1.3266,
+ "step": 1130
+ },
+ {
+ "epoch": 0.7275750336793922,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 4.1784466243982324e-05,
+ "loss": 1.3447,
+ "step": 1135
+ },
+ {
+ "epoch": 0.7307802100392133,
+ "grad_norm": 0.052001953125,
+ "learning_rate": 4.0877813649621076e-05,
+ "loss": 1.3385,
+ "step": 1140
+ },
+ {
+ "epoch": 0.7339853863990344,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 3.997857194516319e-05,
+ "loss": 1.3403,
+ "step": 1145
+ },
+ {
+ "epoch": 0.7371905627588555,
+ "grad_norm": 0.05078125,
+ "learning_rate": 3.9086853849398065e-05,
+ "loss": 1.3503,
+ "step": 1150
+ },
1628
+ {
+ "epoch": 0.7403957391186766,
+ "grad_norm": 0.06396484375,
+ "learning_rate": 3.8202771138040336e-05,
+ "loss": 1.3354,
+ "step": 1155
+ },
+ {
+ "epoch": 0.7436009154784978,
+ "grad_norm": 0.05078125,
+ "learning_rate": 3.732643462971912e-05,
+ "loss": 1.3258,
+ "step": 1160
+ },
+ {
+ "epoch": 0.7468060918383189,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 3.6457954172086896e-05,
+ "loss": 1.3493,
+ "step": 1165
+ },
+ {
+ "epoch": 0.75001126819814,
+ "grad_norm": 0.046875,
+ "learning_rate": 3.559743862805034e-05,
+ "loss": 1.3275,
+ "step": 1170
+ },
+ {
+ "epoch": 0.7532164445579611,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 3.47449958621245e-05,
+ "loss": 1.3148,
+ "step": 1175
+ },
+ {
+ "epoch": 0.7564216209177822,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 3.390073272691198e-05,
+ "loss": 1.3338,
+ "step": 1180
+ },
+ {
+ "epoch": 0.7596267972776033,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 3.306475504970931e-05,
+ "loss": 1.2935,
+ "step": 1185
+ },
+ {
+ "epoch": 0.7628319736374244,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 3.2237167619241495e-05,
+ "loss": 1.3275,
+ "step": 1190
+ },
+ {
+ "epoch": 0.7660371499972456,
+ "grad_norm": 0.056396484375,
+ "learning_rate": 3.141807417252697e-05,
+ "loss": 1.3461,
+ "step": 1195
+ },
+ {
+ "epoch": 0.7692423263570667,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 3.060757738187409e-05,
+ "loss": 1.3394,
+ "step": 1200
+ },
1698
+ {
+ "epoch": 0.7724475027168878,
+ "grad_norm": 0.053955078125,
+ "learning_rate": 2.980577884201169e-05,
+ "loss": 1.3511,
+ "step": 1205
+ },
+ {
+ "epoch": 0.7756526790767089,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 2.9012779057353855e-05,
+ "loss": 1.3213,
+ "step": 1210
+ },
+ {
+ "epoch": 0.77885785543653,
+ "grad_norm": 0.0576171875,
+ "learning_rate": 2.822867742940214e-05,
+ "loss": 1.3384,
+ "step": 1215
+ },
+ {
+ "epoch": 0.7820630317963511,
+ "grad_norm": 0.04833984375,
+ "learning_rate": 2.745357224428563e-05,
+ "loss": 1.343,
+ "step": 1220
+ },
+ {
+ "epoch": 0.7852682081561723,
+ "grad_norm": 0.049560546875,
+ "learning_rate": 2.6687560660440858e-05,
+ "loss": 1.3541,
+ "step": 1225
+ },
+ {
+ "epoch": 0.7884733845159934,
+ "grad_norm": 0.047607421875,
+ "learning_rate": 2.593073869643312e-05,
+ "loss": 1.3491,
+ "step": 1230
+ },
+ {
+ "epoch": 0.7916785608758145,
+ "grad_norm": 0.04248046875,
+ "learning_rate": 2.518320121892076e-05,
+ "loss": 1.3439,
+ "step": 1235
+ },
+ {
+ "epoch": 0.7948837372356355,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 2.4445041930763678e-05,
+ "loss": 1.3236,
+ "step": 1240
+ },
+ {
+ "epoch": 0.7980889135954566,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 2.371635335927781e-05,
+ "loss": 1.3505,
+ "step": 1245
+ },
+ {
+ "epoch": 0.8012940899552777,
+ "grad_norm": 0.0517578125,
+ "learning_rate": 2.2997226844636977e-05,
+ "loss": 1.3223,
+ "step": 1250
+ },
1768
+ {
+ "epoch": 0.8044992663150989,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 2.2287752528423468e-05,
+ "loss": 1.3282,
+ "step": 1255
+ },
+ {
+ "epoch": 0.80770444267492,
+ "grad_norm": 0.046875,
+ "learning_rate": 2.1588019342328968e-05,
+ "loss": 1.3294,
+ "step": 1260
+ },
+ {
+ "epoch": 0.8109096190347411,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 2.089811499700699e-05,
+ "loss": 1.3356,
+ "step": 1265
+ },
+ {
+ "epoch": 0.8141147953945622,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 2.021812597107855e-05,
+ "loss": 1.3486,
+ "step": 1270
+ },
+ {
+ "epoch": 0.8173199717543833,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 1.954813750029216e-05,
+ "loss": 1.3492,
+ "step": 1275
+ },
+ {
+ "epoch": 0.8205251481142044,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 1.8888233566839653e-05,
+ "loss": 1.329,
+ "step": 1280
+ },
+ {
+ "epoch": 0.8237303244740255,
+ "grad_norm": 0.048095703125,
+ "learning_rate": 1.8238496888828982e-05,
+ "loss": 1.317,
+ "step": 1285
+ },
+ {
+ "epoch": 0.8269355008338467,
+ "grad_norm": 0.051513671875,
+ "learning_rate": 1.759900890991589e-05,
+ "loss": 1.3177,
+ "step": 1290
+ },
+ {
+ "epoch": 0.8301406771936678,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 1.696984978909476e-05,
+ "loss": 1.323,
+ "step": 1295
+ },
+ {
+ "epoch": 0.8333458535534889,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 1.6351098390650966e-05,
+ "loss": 1.3517,
+ "step": 1300
+ },
1838
+ {
+ "epoch": 0.83655102991331,
+ "grad_norm": 0.052978515625,
+ "learning_rate": 1.5742832274275288e-05,
+ "loss": 1.35,
+ "step": 1305
+ },
+ {
+ "epoch": 0.8397562062731311,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 1.514512768534193e-05,
+ "loss": 1.3614,
+ "step": 1310
+ },
+ {
+ "epoch": 0.8429613826329522,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 1.4558059545351143e-05,
+ "loss": 1.3389,
+ "step": 1315
+ },
+ {
+ "epoch": 0.8461665589927734,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 1.3981701442538153e-05,
+ "loss": 1.3272,
+ "step": 1320
+ },
+ {
+ "epoch": 0.8493717353525945,
+ "grad_norm": 0.048583984375,
+ "learning_rate": 1.3416125622648668e-05,
+ "loss": 1.3324,
+ "step": 1325
+ },
+ {
+ "epoch": 0.8525769117124156,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 1.286140297988323e-05,
+ "loss": 1.3352,
+ "step": 1330
+ },
+ {
+ "epoch": 0.8557820880722367,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 1.231760304801054e-05,
+ "loss": 1.3361,
+ "step": 1335
+ },
+ {
+ "epoch": 0.8589872644320578,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 1.1784793991651621e-05,
+ "loss": 1.3252,
+ "step": 1340
+ },
+ {
+ "epoch": 0.8621924407918788,
+ "grad_norm": 0.044189453125,
+ "learning_rate": 1.1263042597735362e-05,
+ "loss": 1.3468,
+ "step": 1345
+ },
+ {
+ "epoch": 0.8653976171517,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 1.0752414267126875e-05,
+ "loss": 1.3301,
+ "step": 1350
+ },
1908
+ {
+ "epoch": 0.8686027935115211,
+ "grad_norm": 0.05029296875,
+ "learning_rate": 1.0252973006429733e-05,
+ "loss": 1.36,
+ "step": 1355
+ },
+ {
+ "epoch": 0.8718079698713422,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 9.764781419962577e-06,
+ "loss": 1.3482,
+ "step": 1360
+ },
+ {
+ "epoch": 0.8750131462311633,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 9.287900701911944e-06,
+ "loss": 1.3232,
+ "step": 1365
+ },
+ {
+ "epoch": 0.8782183225909844,
+ "grad_norm": 0.04931640625,
+ "learning_rate": 8.822390628661582e-06,
+ "loss": 1.3571,
+ "step": 1370
+ },
+ {
+ "epoch": 0.8814234989508055,
+ "grad_norm": 0.044921875,
+ "learning_rate": 8.368309551299536e-06,
+ "loss": 1.3274,
+ "step": 1375
+ },
+ {
+ "epoch": 0.8846286753106266,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 7.92571438830394e-06,
+ "loss": 1.3656,
+ "step": 1380
+ },
+ {
+ "epoch": 0.8878338516704478,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 7.494660618408378e-06,
+ "loss": 1.3659,
+ "step": 1385
+ },
+ {
+ "epoch": 0.8910390280302689,
+ "grad_norm": 0.04541015625,
+ "learning_rate": 7.075202273647652e-06,
+ "loss": 1.3305,
+ "step": 1390
+ },
+ {
+ "epoch": 0.89424420439009,
+ "grad_norm": 0.046875,
+ "learning_rate": 6.667391932584999e-06,
+ "loss": 1.36,
+ "step": 1395
+ },
+ {
+ "epoch": 0.8974493807499111,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 6.271280713721317e-06,
+ "loss": 1.3382,
+ "step": 1400
+ },
1978
+ {
+ "epoch": 0.9006545571097322,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 5.886918269087716e-06,
+ "loss": 1.326,
+ "step": 1405
+ },
+ {
+ "epoch": 0.9038597334695533,
+ "grad_norm": 0.046875,
+ "learning_rate": 5.514352778021492e-06,
+ "loss": 1.3602,
+ "step": 1410
+ },
+ {
+ "epoch": 0.9070649098293745,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 5.153630941127063e-06,
+ "loss": 1.3407,
+ "step": 1415
+ },
+ {
+ "epoch": 0.9102700861891956,
+ "grad_norm": 0.046875,
+ "learning_rate": 4.804797974422026e-06,
+ "loss": 1.3241,
+ "step": 1420
+ },
+ {
+ "epoch": 0.9134752625490167,
+ "grad_norm": 0.050537109375,
+ "learning_rate": 4.4678976036694355e-06,
+ "loss": 1.3324,
+ "step": 1425
+ },
+ {
+ "epoch": 0.9166804389088378,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 4.142972058896811e-06,
+ "loss": 1.3267,
+ "step": 1430
+ },
+ {
+ "epoch": 0.9198856152686589,
+ "grad_norm": 0.044921875,
+ "learning_rate": 3.830062069102602e-06,
+ "loss": 1.3496,
+ "step": 1435
+ },
+ {
+ "epoch": 0.92309079162848,
+ "grad_norm": 0.046630859375,
+ "learning_rate": 3.529206857151035e-06,
+ "loss": 1.3481,
+ "step": 1440
+ },
+ {
+ "epoch": 0.9262959679883012,
+ "grad_norm": 0.04345703125,
+ "learning_rate": 3.240444134855347e-06,
+ "loss": 1.3433,
+ "step": 1445
+ },
+ {
+ "epoch": 0.9295011443481223,
+ "grad_norm": 0.045654296875,
+ "learning_rate": 2.963810098250841e-06,
+ "loss": 1.3555,
+ "step": 1450
+ },
2048
+ {
+ "epoch": 0.9327063207079433,
+ "grad_norm": 0.044921875,
+ "learning_rate": 2.6993394230576674e-06,
+ "loss": 1.3218,
+ "step": 1455
+ },
+ {
+ "epoch": 0.9359114970677644,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 2.4470652603343023e-06,
+ "loss": 1.346,
+ "step": 1460
+ },
+ {
+ "epoch": 0.9391166734275855,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 2.2070192323220607e-06,
+ "loss": 1.3551,
+ "step": 1465
+ },
+ {
+ "epoch": 0.9423218497874066,
+ "grad_norm": 0.0439453125,
+ "learning_rate": 1.9792314284813986e-06,
+ "loss": 1.3262,
+ "step": 1470
+ },
+ {
+ "epoch": 0.9455270261472277,
+ "grad_norm": 0.04736328125,
+ "learning_rate": 1.763730401720065e-06,
+ "loss": 1.3257,
+ "step": 1475
+ },
+ {
+ "epoch": 0.9487322025070489,
+ "grad_norm": 0.046142578125,
+ "learning_rate": 1.5605431648141878e-06,
+ "loss": 1.3158,
+ "step": 1480
+ },
+ {
+ "epoch": 0.95193737886687,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 1.3696951870222018e-06,
+ "loss": 1.3637,
+ "step": 1485
+ },
+ {
+ "epoch": 0.9551425552266911,
+ "grad_norm": 0.053466796875,
+ "learning_rate": 1.1912103908922945e-06,
+ "loss": 1.3337,
+ "step": 1490
+ },
+ {
+ "epoch": 0.9583477315865122,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 1.0251111492637244e-06,
+ "loss": 1.3557,
+ "step": 1495
+ },
+ {
+ "epoch": 0.9615529079463333,
+ "grad_norm": 0.05126953125,
+ "learning_rate": 8.714182824624883e-07,
+ "loss": 1.3373,
+ "step": 1500
+ },
2118
+ {
+ "epoch": 0.9647580843061544,
+ "grad_norm": 0.0458984375,
+ "learning_rate": 7.301510556914859e-07,
+ "loss": 1.3274,
+ "step": 1505
+ },
+ {
+ "epoch": 0.9679632606659756,
+ "grad_norm": 0.05224609375,
+ "learning_rate": 6.01327176615607e-07,
+ "loss": 1.3894,
+ "step": 1510
+ },
+ {
+ "epoch": 0.9711684370257967,
+ "grad_norm": 0.045166015625,
+ "learning_rate": 4.84962793142163e-07,
+ "loss": 1.3419,
+ "step": 1515
+ },
+ {
+ "epoch": 0.9743736133856178,
+ "grad_norm": 0.044677734375,
+ "learning_rate": 3.8107249139672783e-07,
+ "loss": 1.3321,
+ "step": 1520
+ },
+ {
+ "epoch": 0.9775787897454389,
+ "grad_norm": 0.0478515625,
+ "learning_rate": 2.89669293894812e-07,
+ "loss": 1.3497,
+ "step": 1525
+ },
+ {
+ "epoch": 0.98078396610526,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 2.1076465790946798e-07,
+ "loss": 1.3518,
+ "step": 1530
+ },
+ {
+ "epoch": 0.9839891424650811,
+ "grad_norm": 0.04638671875,
+ "learning_rate": 1.443684740351947e-07,
+ "loss": 1.3224,
+ "step": 1535
+ },
+ {
+ "epoch": 0.9871943188249022,
+ "grad_norm": 0.049072265625,
+ "learning_rate": 9.048906494811826e-08,
+ "loss": 1.3513,
+ "step": 1540
+ },
+ {
+ "epoch": 0.9903994951847234,
+ "grad_norm": 0.050048828125,
+ "learning_rate": 4.9133184362748497e-08,
+ "loss": 1.3494,
+ "step": 1545
+ },
+ {
+ "epoch": 0.9936046715445445,
+ "grad_norm": 0.04443359375,
+ "learning_rate": 2.0306016185456244e-08,
+ "loss": 1.3344,
+ "step": 1550
+ },
2188
+ {
+ "epoch": 0.9968098479043656,
+ "grad_norm": 0.047119140625,
+ "learning_rate": 4.011173864637563e-09,
+ "loss": 1.3662,
+ "step": 1555
+ },
+ {
+ "epoch": 0.9993739889922224,
+ "eval_loss": 1.4191993474960327,
+ "eval_runtime": 1938.5869,
+ "eval_samples_per_second": 7.3,
+ "eval_steps_per_second": 7.3,
+ "step": 1559
+ },
+ {
+ "epoch": 0.9993739889922224,
+ "step": 1559,
+ "total_flos": 3.232184148701479e+18,
+ "train_loss": 0.016414370117774753,
+ "train_runtime": 2971.8566,
+ "train_samples_per_second": 67.189,
+ "train_steps_per_second": 0.525
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 1559,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 20,
+ "total_flos": 3.232184148701479e+18,
+ "train_batch_size": 1,
+ "trial_name": null,
+ "trial_params": null
+ }
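
The per-step records above are the tail of the trainer state file added in this commit. As a minimal sketch (assuming a local clone of this repo and the standard Hugging Face Trainer state layout, where the file is named `trainer_state.json` and holds a top-level `log_history` list), the training-loss curve can be recovered like so:

```python
# Minimal sketch, assuming a local clone of this repo and the standard
# Hugging Face Trainer state layout (a top-level "log_history" list).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step training records carry "loss"; the single evaluation record
# carries "eval_loss" and the final summary carries "train_loss",
# so both are filtered out here.
train_records = [r for r in state["log_history"] if "loss" in r]
steps = [r["step"] for r in train_records]
losses = [r["loss"] for r in train_records]
print(f"last logged step: {steps[-1]}, loss: {losses[-1]}")  # 1555, 1.3662
```
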
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9368167a7c8c2e5afaeb29db0e275336ff5bb5ad11c57c441e568b0c63b3082
+ size 5112
vocab.json ADDED
The diff for this file is too large to render. See raw diff