rayonlabs

alicegoesdown committed on 18 days ago

Commit

a25fd62

verified ·

0 Parent(s):

Duplicate from alicegoesdown/46x2400b14

Browse files

Co-authored-by: alice goesdown <[email protected]>

Files changed (24) hide show

.gitattributes +35 -0
adapter_config.json +33 -0
adapter_model.bin +3 -0
adapter_model.safetensors +3 -0
config.json +32 -0
last-checkpoint/README.md +202 -0
last-checkpoint/adapter_config.json +33 -0
last-checkpoint/adapter_model.safetensors +3 -0
last-checkpoint/merges.txt +0 -0
last-checkpoint/optimizer.pt +3 -0
last-checkpoint/rng_state.pth +3 -0
last-checkpoint/scheduler.pt +3 -0
last-checkpoint/special_tokens_map.json +30 -0
last-checkpoint/tokenizer.json +0 -0
last-checkpoint/tokenizer_config.json +31 -0
last-checkpoint/trainer_state.json +2978 -0
last-checkpoint/training_args.bin +3 -0
last-checkpoint/vocab.json +0 -0
merges.txt +0 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer_config.json +31 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "facebook/opt-125m",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "k_proj",
+    "fc1",
+    "fc2",
+    "out_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca30eea9845254182cef3b7d6dec4966262856e7b52e8b7eb7b1bed47fe8b5bd
+size 5360906

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eab8a9e2aded79d9edbd838f89021f6629087b8d20793052287f02d4fc50acf1
+size 5327744

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "facebook/opt-125m",
+  "_remove_final_layer_norm": false,
+  "activation_dropout": 0.0,
+  "activation_function": "relu",
+  "architectures": [
+    "OPTForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "do_layer_norm_before": true,
+  "dropout": 0.1,
+  "enable_bias": true,
+  "eos_token_id": 2,
+  "ffn_dim": 3072,
+  "hidden_size": 768,
+  "init_std": 0.02,
+  "layer_norm_elementwise_affine": true,
+  "layerdrop": 0.0,
+  "max_position_embeddings": 2048,
+  "model_type": "opt",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "prefix": "</s>",
+  "torch_dtype": "float16",
+  "transformers_version": "4.46.0",
+  "use_cache": false,
+  "vocab_size": 50272,
+  "word_embed_proj_dim": 768
+}

last-checkpoint/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: facebook/opt-125m
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app. -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

last-checkpoint/adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "facebook/opt-125m",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "k_proj",
+    "fc1",
+    "fc2",
+    "out_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

last-checkpoint/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eab8a9e2aded79d9edbd838f89021f6629087b8d20793052287f02d4fc50acf1
+size 5327744

last-checkpoint/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

last-checkpoint/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f18488d59c86cac0d45b865b7a692172eeeb7eedd8303b0795d73cce699646d
+size 10737850

last-checkpoint/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17b42f753a16fbf1cd2ed2780fea89be5d87d2f94c9c441b092817f34cc12686
+size 14244

last-checkpoint/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55d4277b35ca6e69ad50678714934604d373a92646f8769316d12fb42bd1f1d9
+size 1064

last-checkpoint/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

last-checkpoint/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

last-checkpoint/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "add_bos_token": true,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "</s>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "errors": "replace",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "</s>"
+}

last-checkpoint/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2978 @@

+{
+  "best_metric": 2.152632236480713,
+  "best_model_checkpoint": "miner_id_24/checkpoint-400",
+  "epoch": 0.49520272361497986,
+  "eval_steps": 25,
+  "global_step": 400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0012380068090374497,
+      "grad_norm": 2.818559408187866,
+      "learning_rate": 2.4999999999999998e-05,
+      "loss": 18.9651,
+      "step": 1
+    },
+    {
+      "epoch": 0.0012380068090374497,
+      "eval_loss": 2.4708304405212402,
+      "eval_runtime": 0.8454,
+      "eval_samples_per_second": 59.141,
+      "eval_steps_per_second": 8.28,
+      "step": 1
+    },
+    {
+      "epoch": 0.0024760136180748994,
+      "grad_norm": 3.5434861183166504,
+      "learning_rate": 4.9999999999999996e-05,
+      "loss": 20.8432,
+      "step": 2
+    },
+    {
+      "epoch": 0.003714020427112349,
+      "grad_norm": 3.321415901184082,
+      "learning_rate": 7.5e-05,
+      "loss": 20.0992,
+      "step": 3
+    },
+    {
+      "epoch": 0.004952027236149799,
+      "grad_norm": 3.442756175994873,
+      "learning_rate": 9.999999999999999e-05,
+      "loss": 20.5289,
+      "step": 4
+    },
+    {
+      "epoch": 0.0061900340451872485,
+      "grad_norm": 3.195467233657837,
+      "learning_rate": 0.000125,
+      "loss": 20.5212,
+      "step": 5
+    },
+    {
+      "epoch": 0.007428040854224698,
+      "grad_norm": 3.0753700733184814,
+      "learning_rate": 0.00015,
+      "loss": 20.0682,
+      "step": 6
+    },
+    {
+      "epoch": 0.008666047663262148,
+      "grad_norm": 3.025089740753174,
+      "learning_rate": 0.000175,
+      "loss": 20.6486,
+      "step": 7
+    },
+    {
+      "epoch": 0.009904054472299598,
+      "grad_norm": 3.266080617904663,
+      "learning_rate": 0.00019999999999999998,
+      "loss": 20.9458,
+      "step": 8
+    },
+    {
+      "epoch": 0.011142061281337047,
+      "grad_norm": 3.6842172145843506,
+      "learning_rate": 0.000225,
+      "loss": 20.432,
+      "step": 9
+    },
+    {
+      "epoch": 0.012380068090374497,
+      "grad_norm": 4.245394229888916,
+      "learning_rate": 0.00025,
+      "loss": 20.469,
+      "step": 10
+    },
+    {
+      "epoch": 0.013618074899411947,
+      "grad_norm": 3.6699397563934326,
+      "learning_rate": 0.00027499999999999996,
+      "loss": 20.0793,
+      "step": 11
+    },
+    {
+      "epoch": 0.014856081708449397,
+      "grad_norm": 3.6665425300598145,
+      "learning_rate": 0.0003,
+      "loss": 21.1171,
+      "step": 12
+    },
+    {
+      "epoch": 0.016094088517486844,
+      "grad_norm": 3.9322142601013184,
+      "learning_rate": 0.0002999955747511861,
+      "loss": 20.6365,
+      "step": 13
+    },
+    {
+      "epoch": 0.017332095326524296,
+      "grad_norm": 3.919208526611328,
+      "learning_rate": 0.00029998229929486034,
+      "loss": 21.118,
+      "step": 14
+    },
+    {
+      "epoch": 0.018570102135561744,
+      "grad_norm": 4.233679294586182,
+      "learning_rate": 0.00029996017450135157,
+      "loss": 21.0626,
+      "step": 15
+    },
+    {
+      "epoch": 0.019808108944599195,
+      "grad_norm": 4.335453510284424,
+      "learning_rate": 0.0002999292018211445,
+      "loss": 20.6267,
+      "step": 16
+    },
+    {
+      "epoch": 0.021046115753636643,
+      "grad_norm": 3.9314210414886475,
+      "learning_rate": 0.0002998893832847845,
+      "loss": 20.9186,
+      "step": 17
+    },
+    {
+      "epoch": 0.022284122562674095,
+      "grad_norm": 4.088806629180908,
+      "learning_rate": 0.0002998407215027447,
+      "loss": 20.3986,
+      "step": 18
+    },
+    {
+      "epoch": 0.023522129371711543,
+      "grad_norm": 4.220116138458252,
+      "learning_rate": 0.0002997832196652546,
+      "loss": 20.9794,
+      "step": 19
+    },
+    {
+      "epoch": 0.024760136180748994,
+      "grad_norm": 4.280047416687012,
+      "learning_rate": 0.00029971688154209106,
+      "loss": 20.5515,
+      "step": 20
+    },
+    {
+      "epoch": 0.025998142989786442,
+      "grad_norm": 4.594794273376465,
+      "learning_rate": 0.00029964171148233107,
+      "loss": 20.916,
+      "step": 21
+    },
+    {
+      "epoch": 0.027236149798823894,
+      "grad_norm": 4.3702802658081055,
+      "learning_rate": 0.00029955771441406685,
+      "loss": 20.3802,
+      "step": 22
+    },
+    {
+      "epoch": 0.02847415660786134,
+      "grad_norm": 4.664053916931152,
+      "learning_rate": 0.0002994648958440824,
+      "loss": 22.0276,
+      "step": 23
+    },
+    {
+      "epoch": 0.029712163416898793,
+      "grad_norm": 5.1516804695129395,
+      "learning_rate": 0.00029936326185749286,
+      "loss": 21.4184,
+      "step": 24
+    },
+    {
+      "epoch": 0.03095017022593624,
+      "grad_norm": 5.225708484649658,
+      "learning_rate": 0.0002992528191173453,
+      "loss": 20.4457,
+      "step": 25
+    },
+    {
+      "epoch": 0.03095017022593624,
+      "eval_loss": 2.320787191390991,
+      "eval_runtime": 0.4599,
+      "eval_samples_per_second": 108.715,
+      "eval_steps_per_second": 15.22,
+      "step": 25
+    },
+    {
+      "epoch": 0.03218817703497369,
+      "grad_norm": 5.942795276641846,
+      "learning_rate": 0.00029913357486418196,
+      "loss": 21.6554,
+      "step": 26
+    },
+    {
+      "epoch": 0.033426183844011144,
+      "grad_norm": 5.838959693908691,
+      "learning_rate": 0.0002990055369155657,
+      "loss": 19.8471,
+      "step": 27
+    },
+    {
+      "epoch": 0.03466419065304859,
+      "grad_norm": 6.582411289215088,
+      "learning_rate": 0.0002988687136655674,
+      "loss": 20.5919,
+      "step": 28
+    },
+    {
+      "epoch": 0.03590219746208604,
+      "grad_norm": 6.935208797454834,
+      "learning_rate": 0.00029872311408421547,
+      "loss": 21.3772,
+      "step": 29
+    },
+    {
+      "epoch": 0.03714020427112349,
+      "grad_norm": 7.257685661315918,
+      "learning_rate": 0.00029856874771690806,
+      "loss": 21.216,
+      "step": 30
+    },
+    {
+      "epoch": 0.03837821108016094,
+      "grad_norm": 7.300308704376221,
+      "learning_rate": 0.0002984056246837872,
+      "loss": 21.4553,
+      "step": 31
+    },
+    {
+      "epoch": 0.03961621788919839,
+      "grad_norm": 9.471358299255371,
+      "learning_rate": 0.0002982337556790752,
+      "loss": 21.2277,
+      "step": 32
+    },
+    {
+      "epoch": 0.04085422469823584,
+      "grad_norm": 8.72948169708252,
+      "learning_rate": 0.0002980531519703738,
+      "loss": 20.7242,
+      "step": 33
+    },
+    {
+      "epoch": 0.04209223150727329,
+      "grad_norm": 9.603164672851562,
+      "learning_rate": 0.000297863825397925,
+      "loss": 20.3744,
+      "step": 34
+    },
+    {
+      "epoch": 0.04333023831631074,
+      "grad_norm": 10.077178955078125,
+      "learning_rate": 0.00029766578837383547,
+      "loss": 20.4889,
+      "step": 35
+    },
+    {
+      "epoch": 0.04456824512534819,
+      "grad_norm": 11.101947784423828,
+      "learning_rate": 0.0002974590538812622,
+      "loss": 20.8914,
+      "step": 36
+    },
+    {
+      "epoch": 0.04580625193438564,
+      "grad_norm": 10.848258972167969,
+      "learning_rate": 0.00029724363547356194,
+      "loss": 18.5333,
+      "step": 37
+    },
+    {
+      "epoch": 0.047044258743423086,
+      "grad_norm": 10.411460876464844,
+      "learning_rate": 0.00029701954727340204,
+      "loss": 20.7163,
+      "step": 38
+    },
+    {
+      "epoch": 0.04828226555246054,
+      "grad_norm": 11.610359191894531,
+      "learning_rate": 0.00029678680397183485,
+      "loss": 21.2355,
+      "step": 39
+    },
+    {
+      "epoch": 0.04952027236149799,
+      "grad_norm": 12.435564041137695,
+      "learning_rate": 0.000296545420827335,
+      "loss": 20.7987,
+      "step": 40
+    },
+    {
+      "epoch": 0.050758279170535436,
+      "grad_norm": 13.064867973327637,
+      "learning_rate": 0.0002962954136647982,
+      "loss": 21.3197,
+      "step": 41
+    },
+    {
+      "epoch": 0.051996285979572884,
+      "grad_norm": 13.486992835998535,
+      "learning_rate": 0.0002960367988745045,
+      "loss": 24.3337,
+      "step": 42
+    },
+    {
+      "epoch": 0.05323429278861034,
+      "grad_norm": 13.281498908996582,
+      "learning_rate": 0.0002957695934110434,
+      "loss": 22.8852,
+      "step": 43
+    },
+    {
+      "epoch": 0.05447229959764779,
+      "grad_norm": 13.668049812316895,
+      "learning_rate": 0.0002954938147922025,
+      "loss": 21.0772,
+      "step": 44
+    },
+    {
+      "epoch": 0.055710306406685235,
+      "grad_norm": 15.47207260131836,
+      "learning_rate": 0.0002952094810978189,
+      "loss": 21.8438,
+      "step": 45
+    },
+    {
+      "epoch": 0.05694831321572268,
+      "grad_norm": 15.751750946044922,
+      "learning_rate": 0.00029491661096859407,
+      "loss": 23.0924,
+      "step": 46
+    },
+    {
+      "epoch": 0.05818632002476014,
+      "grad_norm": 17.3387393951416,
+      "learning_rate": 0.0002946152236048715,
+      "loss": 24.2195,
+      "step": 47
+    },
+    {
+      "epoch": 0.059424326833797586,
+      "grad_norm": 19.328227996826172,
+      "learning_rate": 0.00029430533876537824,
+      "loss": 25.2434,
+      "step": 48
+    },
+    {
+      "epoch": 0.060662333642835034,
+      "grad_norm": 26.080963134765625,
+      "learning_rate": 0.0002939869767659294,
+      "loss": 26.2704,
+      "step": 49
+    },
+    {
+      "epoch": 0.06190034045187248,
+      "grad_norm": 58.686161041259766,
+      "learning_rate": 0.0002936601584780962,
+      "loss": 31.1293,
+      "step": 50
+    },
+    {
+      "epoch": 0.06190034045187248,
+      "eval_loss": 3.0001349449157715,
+      "eval_runtime": 0.4579,
+      "eval_samples_per_second": 109.203,
+      "eval_steps_per_second": 15.288,
+      "step": 50
+    },
+    {
+      "epoch": 0.06313834726090993,
+      "grad_norm": 101.44572448730469,
+      "learning_rate": 0.0002933249053278378,
+      "loss": 26.0663,
+      "step": 51
+    },
+    {
+      "epoch": 0.06437635406994738,
+      "grad_norm": 92.61017608642578,
+      "learning_rate": 0.00029298123929409647,
+      "loss": 26.2133,
+      "step": 52
+    },
+    {
+      "epoch": 0.06561436087898484,
+      "grad_norm": 67.51494598388672,
+      "learning_rate": 0.00029262918290735683,
+      "loss": 25.3339,
+      "step": 53
+    },
+    {
+      "epoch": 0.06685236768802229,
+      "grad_norm": 38.48495101928711,
+      "learning_rate": 0.0002922687592481686,
+      "loss": 23.3798,
+      "step": 54
+    },
+    {
+      "epoch": 0.06809037449705974,
+      "grad_norm": 26.016254425048828,
+      "learning_rate": 0.0002918999919456335,
+      "loss": 22.147,
+      "step": 55
+    },
+    {
+      "epoch": 0.06932838130609718,
+      "grad_norm": 18.342018127441406,
+      "learning_rate": 0.00029152290517585637,
+      "loss": 21.2174,
+      "step": 56
+    },
+    {
+      "epoch": 0.07056638811513463,
+      "grad_norm": 14.274846076965332,
+      "learning_rate": 0.00029113752366035977,
+      "loss": 22.0883,
+      "step": 57
+    },
+    {
+      "epoch": 0.07180439492417208,
+      "grad_norm": 9.59068489074707,
+      "learning_rate": 0.0002907438726644637,
+      "loss": 21.2135,
+      "step": 58
+    },
+    {
+      "epoch": 0.07304240173320953,
+      "grad_norm": 7.411025047302246,
+      "learning_rate": 0.0002903419779956289,
+      "loss": 20.9369,
+      "step": 59
+    },
+    {
+      "epoch": 0.07428040854224698,
+      "grad_norm": 6.899316310882568,
+      "learning_rate": 0.00028993186600176504,
+      "loss": 20.4407,
+      "step": 60
+    },
+    {
+      "epoch": 0.07551841535128444,
+      "grad_norm": 6.684574604034424,
+      "learning_rate": 0.00028951356356950354,
+      "loss": 20.8978,
+      "step": 61
+    },
+    {
+      "epoch": 0.07675642216032189,
+      "grad_norm": 6.599917411804199,
+      "learning_rate": 0.00028908709812243465,
+      "loss": 20.2899,
+      "step": 62
+    },
+    {
+      "epoch": 0.07799442896935933,
+      "grad_norm": 5.652791500091553,
+      "learning_rate": 0.0002886524976193096,
+      "loss": 20.2204,
+      "step": 63
+    },
+    {
+      "epoch": 0.07923243577839678,
+      "grad_norm": 4.608004093170166,
+      "learning_rate": 0.0002882097905522079,
+      "loss": 19.3632,
+      "step": 64
+    },
+    {
+      "epoch": 0.08047044258743423,
+      "grad_norm": 4.681418418884277,
+      "learning_rate": 0.00028775900594466914,
+      "loss": 20.2131,
+      "step": 65
+    },
+    {
+      "epoch": 0.08170844939647168,
+      "grad_norm": 4.387142658233643,
+      "learning_rate": 0.0002873001733497903,
+      "loss": 20.5456,
+      "step": 66
+    },
+    {
+      "epoch": 0.08294645620550913,
+      "grad_norm": 4.114448070526123,
+      "learning_rate": 0.0002868333228482884,
+      "loss": 19.6216,
+      "step": 67
+    },
+    {
+      "epoch": 0.08418446301454657,
+      "grad_norm": 4.464451313018799,
+      "learning_rate": 0.00028635848504652834,
+      "loss": 20.1096,
+      "step": 68
+    },
+    {
+      "epoch": 0.08542246982358404,
+      "grad_norm": 4.255133628845215,
+      "learning_rate": 0.0002858756910745163,
+      "loss": 19.8431,
+      "step": 69
+    },
+    {
+      "epoch": 0.08666047663262148,
+      "grad_norm": 4.084961891174316,
+      "learning_rate": 0.00028538497258385895,
+      "loss": 20.6091,
+      "step": 70
+    },
+    {
+      "epoch": 0.08789848344165893,
+      "grad_norm": 4.125985622406006,
+      "learning_rate": 0.0002848863617456884,
+      "loss": 20.6725,
+      "step": 71
+    },
+    {
+      "epoch": 0.08913649025069638,
+      "grad_norm": 4.283081531524658,
+      "learning_rate": 0.00028437989124855317,
+      "loss": 20.0569,
+      "step": 72
+    },
+    {
+      "epoch": 0.09037449705973383,
+      "grad_norm": 4.594552040100098,
+      "learning_rate": 0.0002838655942962749,
+      "loss": 20.0793,
+      "step": 73
+    },
+    {
+      "epoch": 0.09161250386877128,
+      "grad_norm": 4.368784427642822,
+      "learning_rate": 0.0002833435046057719,
+      "loss": 20.0819,
+      "step": 74
+    },
+    {
+      "epoch": 0.09285051067780872,
+      "grad_norm": 5.178912162780762,
+      "learning_rate": 0.0002828136564048483,
+      "loss": 19.6422,
+      "step": 75
+    },
+    {
+      "epoch": 0.09285051067780872,
+      "eval_loss": 2.271275043487549,
+      "eval_runtime": 0.4591,
+      "eval_samples_per_second": 108.911,
+      "eval_steps_per_second": 15.248,
+      "step": 75
+    },
+    {
+      "epoch": 0.09408851748684617,
+      "grad_norm": 4.945533752441406,
+      "learning_rate": 0.00028227608442995037,
+      "loss": 19.9913,
+      "step": 76
+    },
+    {
+      "epoch": 0.09532652429588363,
+      "grad_norm": 5.191717624664307,
+      "learning_rate": 0.00028173082392388913,
+      "loss": 19.2061,
+      "step": 77
+    },
+    {
+      "epoch": 0.09656453110492108,
+      "grad_norm": 6.53533411026001,
+      "learning_rate": 0.00028117791063352987,
+      "loss": 21.5617,
+      "step": 78
+    },
+    {
+      "epoch": 0.09780253791395853,
+      "grad_norm": 5.942039489746094,
+      "learning_rate": 0.0002806173808074487,
+      "loss": 20.6797,
+      "step": 79
+    },
+    {
+      "epoch": 0.09904054472299598,
+      "grad_norm": 6.160003185272217,
+      "learning_rate": 0.0002800492711935558,
+      "loss": 19.4544,
+      "step": 80
+    },
+    {
+      "epoch": 0.10027855153203342,
+      "grad_norm": 7.163733005523682,
+      "learning_rate": 0.00027947361903668685,
+      "loss": 20.1732,
+      "step": 81
+    },
+    {
+      "epoch": 0.10151655834107087,
+      "grad_norm": 7.42068338394165,
+      "learning_rate": 0.00027889046207616055,
+      "loss": 19.2081,
+      "step": 82
+    },
+    {
+      "epoch": 0.10275456515010832,
+      "grad_norm": 7.471425533294678,
+      "learning_rate": 0.0002782998385433052,
+      "loss": 20.5223,
+      "step": 83
+    },
+    {
+      "epoch": 0.10399257195914577,
+      "grad_norm": 8.112000465393066,
+      "learning_rate": 0.00027770178715895156,
+      "loss": 21.4008,
+      "step": 84
+    },
+    {
+      "epoch": 0.10523057876818323,
+      "grad_norm": 8.954514503479004,
+      "learning_rate": 0.000277096347130895,
+      "loss": 19.0142,
+      "step": 85
+    },
+    {
+      "epoch": 0.10646858557722068,
+      "grad_norm": 8.809404373168945,
+      "learning_rate": 0.0002764835581513246,
+      "loss": 18.2127,
+      "step": 86
+    },
+    {
+      "epoch": 0.10770659238625813,
+      "grad_norm": 9.259366035461426,
+      "learning_rate": 0.0002758634603942212,
+      "loss": 20.4813,
+      "step": 87
+    },
+    {
+      "epoch": 0.10894459919529557,
+      "grad_norm": 9.49925422668457,
+      "learning_rate": 0.00027523609451272343,
+      "loss": 19.4839,
+      "step": 88
+    },
+    {
+      "epoch": 0.11018260600433302,
+      "grad_norm": 10.327431678771973,
+      "learning_rate": 0.00027460150163646286,
+      "loss": 19.1966,
+      "step": 89
+    },
+    {
+      "epoch": 0.11142061281337047,
+      "grad_norm": 10.627594947814941,
+      "learning_rate": 0.0002739597233688672,
+      "loss": 19.5016,
+      "step": 90
+    },
+    {
+      "epoch": 0.11265861962240792,
+      "grad_norm": 11.462135314941406,
+      "learning_rate": 0.00027331080178443303,
+      "loss": 21.5427,
+      "step": 91
+    },
+    {
+      "epoch": 0.11389662643144537,
+      "grad_norm": 11.9698486328125,
+      "learning_rate": 0.0002726547794259673,
+      "loss": 20.8448,
+      "step": 92
+    },
+    {
+      "epoch": 0.11513463324048283,
+      "grad_norm": 12.26305866241455,
+      "learning_rate": 0.00027199169930179834,
+      "loss": 19.9607,
+      "step": 93
+    },
+    {
+      "epoch": 0.11637264004952028,
+      "grad_norm": 12.922258377075195,
+      "learning_rate": 0.0002713216048829563,
+      "loss": 20.6943,
+      "step": 94
+    },
+    {
+      "epoch": 0.11761064685855772,
+      "grad_norm": 13.900535583496094,
+      "learning_rate": 0.0002706445401003232,
+      "loss": 20.8494,
+      "step": 95
+    },
+    {
+      "epoch": 0.11884865366759517,
+      "grad_norm": 14.97961139678955,
+      "learning_rate": 0.00026996054934175267,
+      "loss": 23.8158,
+      "step": 96
+    },
+    {
+      "epoch": 0.12008666047663262,
+      "grad_norm": 17.248981475830078,
+      "learning_rate": 0.0002692696774491602,
+      "loss": 27.175,
+      "step": 97
+    },
+    {
+      "epoch": 0.12132466728567007,
+      "grad_norm": 23.119956970214844,
+      "learning_rate": 0.00026857196971558306,
+      "loss": 26.2062,
+      "step": 98
+    },
+    {
+      "epoch": 0.12256267409470752,
+      "grad_norm": 21.718097686767578,
+      "learning_rate": 0.00026786747188221145,
+      "loss": 27.1384,
+      "step": 99
+    },
+    {
+      "epoch": 0.12380068090374496,
+      "grad_norm": 39.15816879272461,
+      "learning_rate": 0.00026715623013538883,
+      "loss": 30.5894,
+      "step": 100
+    },
+    {
+      "epoch": 0.12380068090374496,
+      "eval_loss": 2.5429229736328125,
+      "eval_runtime": 0.4584,
+      "eval_samples_per_second": 109.085,
+      "eval_steps_per_second": 15.272,
+      "step": 100
+    },
+    {
+      "epoch": 0.12503868771278243,
+      "grad_norm": 22.390485763549805,
+      "learning_rate": 0.00026643829110358495,
+      "loss": 22.6071,
+      "step": 101
+    },
+    {
+      "epoch": 0.12627669452181986,
+      "grad_norm": 21.849397659301758,
+      "learning_rate": 0.0002657137018543382,
+      "loss": 22.0142,
+      "step": 102
+    },
+    {
+      "epoch": 0.12751470133085732,
+      "grad_norm": 19.895065307617188,
+      "learning_rate": 0.0002649825098911704,
+      "loss": 21.9349,
+      "step": 103
+    },
+    {
+      "epoch": 0.12875270813989476,
+      "grad_norm": 16.992504119873047,
+      "learning_rate": 0.00026424476315047203,
+      "loss": 21.477,
+      "step": 104
+    },
+    {
+      "epoch": 0.12999071494893222,
+      "grad_norm": 12.78968334197998,
+      "learning_rate": 0.0002635005099983601,
+      "loss": 20.9513,
+      "step": 105
+    },
+    {
+      "epoch": 0.13122872175796968,
+      "grad_norm": 10.675745964050293,
+      "learning_rate": 0.0002627497992275069,
+      "loss": 21.1391,
+      "step": 106
+    },
+    {
+      "epoch": 0.1324667285670071,
+      "grad_norm": 8.675626754760742,
+      "learning_rate": 0.0002619926800539412,
+      "loss": 19.64,
+      "step": 107
+    },
+    {
+      "epoch": 0.13370473537604458,
+      "grad_norm": 7.027200222015381,
+      "learning_rate": 0.0002612292021138219,
+      "loss": 20.2259,
+      "step": 108
+    },
+    {
+      "epoch": 0.134942742185082,
+      "grad_norm": 5.689077854156494,
+      "learning_rate": 0.0002604594154601839,
+      "loss": 19.5883,
+      "step": 109
+    },
+    {
+      "epoch": 0.13618074899411947,
+      "grad_norm": 5.175965309143066,
+      "learning_rate": 0.0002596833705596564,
+      "loss": 19.7876,
+      "step": 110
+    },
+    {
+      "epoch": 0.1374187558031569,
+      "grad_norm": 4.827434539794922,
+      "learning_rate": 0.00025890111828915453,
+      "loss": 19.7914,
+      "step": 111
+    },
+    {
+      "epoch": 0.13865676261219437,
+      "grad_norm": 4.572112083435059,
+      "learning_rate": 0.0002581127099325441,
+      "loss": 20.4453,
+      "step": 112
+    },
+    {
+      "epoch": 0.13989476942123183,
+      "grad_norm": 3.9443328380584717,
+      "learning_rate": 0.00025731819717727893,
+      "loss": 19.8542,
+      "step": 113
+    },
+    {
+      "epoch": 0.14113277623026926,
+      "grad_norm": 3.5700809955596924,
+      "learning_rate": 0.0002565176321110129,
+      "loss": 19.1653,
+      "step": 114
+    },
+    {
+      "epoch": 0.14237078303930673,
+      "grad_norm": 3.7322165966033936,
+      "learning_rate": 0.0002557110672181847,
+      "loss": 20.1589,
+      "step": 115
+    },
+    {
+      "epoch": 0.14360878984834416,
+      "grad_norm": 3.796776056289673,
+      "learning_rate": 0.0002548985553765769,
+      "loss": 19.8287,
+      "step": 116
+    },
+    {
+      "epoch": 0.14484679665738162,
+      "grad_norm": 3.7718887329101562,
+      "learning_rate": 0.0002540801498538499,
+      "loss": 19.7224,
+      "step": 117
+    },
+    {
+      "epoch": 0.14608480346641906,
+      "grad_norm": 3.862229824066162,
+      "learning_rate": 0.0002532559043040491,
+      "loss": 19.3835,
+      "step": 118
+    },
+    {
+      "epoch": 0.14732281027545652,
+      "grad_norm": 3.927959442138672,
+      "learning_rate": 0.00025242587276408764,
+      "loss": 19.486,
+      "step": 119
+    },
+    {
+      "epoch": 0.14856081708449395,
+      "grad_norm": 4.238428115844727,
+      "learning_rate": 0.00025159010965020384,
+      "loss": 20.2494,
+      "step": 120
+    },
+    {
+      "epoch": 0.1497988238935314,
+      "grad_norm": 4.240636825561523,
+      "learning_rate": 0.00025074866975439384,
+      "loss": 20.2118,
+      "step": 121
+    },
+    {
+      "epoch": 0.15103683070256887,
+      "grad_norm": 4.528189182281494,
+      "learning_rate": 0.000249901608240819,
+      "loss": 19.49,
+      "step": 122
+    },
+    {
+      "epoch": 0.1522748375116063,
+      "grad_norm": 4.5070366859436035,
+      "learning_rate": 0.0002490489806421898,
+      "loss": 20.4008,
+      "step": 123
+    },
+    {
+      "epoch": 0.15351284432064377,
+      "grad_norm": 4.571038722991943,
+      "learning_rate": 0.0002481908428561252,
+      "loss": 19.7522,
+      "step": 124
+    },
+    {
+      "epoch": 0.1547508511296812,
+      "grad_norm": 4.730791091918945,
+      "learning_rate": 0.00024732725114148753,
+      "loss": 19.8851,
+      "step": 125
+    },
+    {
+      "epoch": 0.1547508511296812,
+      "eval_loss": 2.2168242931365967,
+      "eval_runtime": 0.4591,
+      "eval_samples_per_second": 108.916,
+      "eval_steps_per_second": 15.248,
+      "step": 125
+    },
+    {
+      "epoch": 0.15598885793871867,
+      "grad_norm": 5.251495838165283,
+      "learning_rate": 0.0002464582621146948,
+      "loss": 20.2512,
+      "step": 126
+    },
+    {
+      "epoch": 0.1572268647477561,
+      "grad_norm": 5.254427909851074,
+      "learning_rate": 0.00024558393274600864,
+      "loss": 19.1956,
+      "step": 127
+    },
+    {
+      "epoch": 0.15846487155679356,
+      "grad_norm": 6.146194934844971,
+      "learning_rate": 0.00024470432035579955,
+      "loss": 19.839,
+      "step": 128
+    },
+    {
+      "epoch": 0.15970287836583102,
+      "grad_norm": 6.025669574737549,
+      "learning_rate": 0.00024381948261078862,
+      "loss": 20.0938,
+      "step": 129
+    },
+    {
+      "epoch": 0.16094088517486846,
+      "grad_norm": 6.4785661697387695,
+      "learning_rate": 0.00024292947752026762,
+      "loss": 19.7979,
+      "step": 130
+    },
+    {
+      "epoch": 0.16217889198390592,
+      "grad_norm": 7.562347888946533,
+      "learning_rate": 0.0002420343634322952,
+      "loss": 18.4927,
+      "step": 131
+    },
+    {
+      "epoch": 0.16341689879294335,
+      "grad_norm": 7.990317344665527,
+      "learning_rate": 0.00024113419902987233,
+      "loss": 19.4672,
+      "step": 132
+    },
+    {
+      "epoch": 0.16465490560198082,
+      "grad_norm": 8.579107284545898,
+      "learning_rate": 0.0002402290433270946,
+      "loss": 19.8719,
+      "step": 133
+    },
+    {
+      "epoch": 0.16589291241101825,
+      "grad_norm": 8.502838134765625,
+      "learning_rate": 0.00023931895566528346,
+      "loss": 19.7653,
+      "step": 134
+    },
+    {
+      "epoch": 0.1671309192200557,
+      "grad_norm": 9.949304580688477,
+      "learning_rate": 0.0002384039957090959,
+      "loss": 19.7794,
+      "step": 135
+    },
+    {
+      "epoch": 0.16836892602909315,
+      "grad_norm": 10.154642105102539,
+      "learning_rate": 0.00023748422344261282,
+      "loss": 20.9916,
+      "step": 136
+    },
+    {
+      "epoch": 0.1696069328381306,
+      "grad_norm": 9.841394424438477,
+      "learning_rate": 0.00023655969916540646,
+      "loss": 18.3436,
+      "step": 137
+    },
+    {
+      "epoch": 0.17084493964716807,
+      "grad_norm": 9.700642585754395,
+      "learning_rate": 0.00023563048348858754,
+      "loss": 19.9465,
+      "step": 138
+    },
+    {
+      "epoch": 0.1720829464562055,
+      "grad_norm": 10.298176765441895,
+      "learning_rate": 0.00023469663733083107,
+      "loss": 18.3093,
+      "step": 139
+    },
+    {
+      "epoch": 0.17332095326524297,
+      "grad_norm": 10.373757362365723,
+      "learning_rate": 0.000233758221914383,
+      "loss": 19.5524,
+      "step": 140
+    },
+    {
+      "epoch": 0.1745589600742804,
+      "grad_norm": 10.68146800994873,
+      "learning_rate": 0.00023281529876104646,
+      "loss": 18.4803,
+      "step": 141
+    },
+    {
+      "epoch": 0.17579696688331786,
+      "grad_norm": 10.278772354125977,
+      "learning_rate": 0.00023186792968814835,
+      "loss": 19.8124,
+      "step": 142
+    },
+    {
+      "epoch": 0.1770349736923553,
+      "grad_norm": 11.193350791931152,
+      "learning_rate": 0.00023091617680448662,
+      "loss": 20.4684,
+      "step": 143
+    },
+    {
+      "epoch": 0.17827298050139276,
+      "grad_norm": 11.797956466674805,
+      "learning_rate": 0.0002299601025062587,
+      "loss": 20.1786,
+      "step": 144
+    },
+    {
+      "epoch": 0.17951098731043022,
+      "grad_norm": 15.304258346557617,
+      "learning_rate": 0.00022899976947297047,
+      "loss": 20.4355,
+      "step": 145
+    },
+    {
+      "epoch": 0.18074899411946765,
+      "grad_norm": 14.091426849365234,
+      "learning_rate": 0.00022803524066332745,
+      "loss": 21.2918,
+      "step": 146
+    },
+    {
+      "epoch": 0.18198700092850512,
+      "grad_norm": 19.633708953857422,
+      "learning_rate": 0.000227066579311107,
+      "loss": 21.3633,
+      "step": 147
+    },
+    {
+      "epoch": 0.18322500773754255,
+      "grad_norm": 19.494171142578125,
+      "learning_rate": 0.00022609384892101274,
+      "loss": 24.4366,
+      "step": 148
+    },
+    {
+      "epoch": 0.18446301454658,
+      "grad_norm": 25.900556564331055,
+      "learning_rate": 0.00022511711326451142,
+      "loss": 26.3771,
+      "step": 149
+    },
+    {
+      "epoch": 0.18570102135561745,
+      "grad_norm": 62.00362014770508,
+      "learning_rate": 0.0002241364363756521,
+      "loss": 34.0405,
+      "step": 150
+    },
+    {
+      "epoch": 0.18570102135561745,
+      "eval_loss": 2.284111976623535,
+      "eval_runtime": 0.4592,
+      "eval_samples_per_second": 108.877,
+      "eval_steps_per_second": 15.243,
+      "step": 150
+    },
+    {
+      "epoch": 0.1869390281646549,
+      "grad_norm": 9.669087409973145,
+      "learning_rate": 0.00022315188254686788,
+      "loss": 19.8338,
+      "step": 151
+    },
+    {
+      "epoch": 0.18817703497369234,
+      "grad_norm": 10.625255584716797,
+      "learning_rate": 0.0002221635163247612,
+      "loss": 20.741,
+      "step": 152
+    },
+    {
+      "epoch": 0.1894150417827298,
+      "grad_norm": 9.427986145019531,
+      "learning_rate": 0.00022117140250587202,
+      "loss": 19.8677,
+      "step": 153
+    },
+    {
+      "epoch": 0.19065304859176727,
+      "grad_norm": 8.14100456237793,
+      "learning_rate": 0.00022017560613243008,
+      "loss": 19.6278,
+      "step": 154
+    },
+    {
+      "epoch": 0.1918910554008047,
+      "grad_norm": 6.954777240753174,
+      "learning_rate": 0.00021917619248809043,
+      "loss": 19.8791,
+      "step": 155
+    },
+    {
+      "epoch": 0.19312906220984216,
+      "grad_norm": 5.720154762268066,
+      "learning_rate": 0.00021817322709365372,
+      "loss": 19.9093,
+      "step": 156
+    },
+    {
+      "epoch": 0.1943670690188796,
+      "grad_norm": 5.18485164642334,
+      "learning_rate": 0.00021716677570277073,
+      "loss": 19.9149,
+      "step": 157
+    },
+    {
+      "epoch": 0.19560507582791706,
+      "grad_norm": 5.029271125793457,
+      "learning_rate": 0.00021615690429763141,
+      "loss": 19.3344,
+      "step": 158
+    },
+    {
+      "epoch": 0.1968430826369545,
+      "grad_norm": 4.456918239593506,
+      "learning_rate": 0.0002151436790846392,
+      "loss": 19.3542,
+      "step": 159
+    },
+    {
+      "epoch": 0.19808108944599195,
+      "grad_norm": 4.820105075836182,
+      "learning_rate": 0.00021412716649007083,
+      "loss": 19.3959,
+      "step": 160
+    },
+    {
+      "epoch": 0.19931909625502942,
+      "grad_norm": 4.102832794189453,
+      "learning_rate": 0.00021310743315572104,
+      "loss": 18.9337,
+      "step": 161
+    },
+    {
+      "epoch": 0.20055710306406685,
+      "grad_norm": 3.9710278511047363,
+      "learning_rate": 0.00021208454593453407,
+      "loss": 18.9453,
+      "step": 162
+    },
+    {
+      "epoch": 0.2017951098731043,
+      "grad_norm": 3.7324447631835938,
+      "learning_rate": 0.00021105857188622045,
+      "loss": 19.8246,
+      "step": 163
+    },
+    {
+      "epoch": 0.20303311668214175,
+      "grad_norm": 3.610128402709961,
+      "learning_rate": 0.00021002957827286078,
+      "loss": 19.3761,
+      "step": 164
+    },
+    {
+      "epoch": 0.2042711234911792,
+      "grad_norm": 3.5811269283294678,
+      "learning_rate": 0.000208997632554496,
+      "loss": 19.5489,
+      "step": 165
+    },
+    {
+      "epoch": 0.20550913030021664,
+      "grad_norm": 3.572359561920166,
+      "learning_rate": 0.00020796280238470492,
+      "loss": 19.1723,
+      "step": 166
+    },
+    {
+      "epoch": 0.2067471371092541,
+      "grad_norm": 3.7215442657470703,
+      "learning_rate": 0.00020692515560616869,
+      "loss": 19.5084,
+      "step": 167
+    },
+    {
+      "epoch": 0.20798514391829154,
+      "grad_norm": 3.805196523666382,
+      "learning_rate": 0.00020588476024622332,
+      "loss": 19.4544,
+      "step": 168
+    },
+    {
+      "epoch": 0.209223150727329,
+      "grad_norm": 3.7352330684661865,
+      "learning_rate": 0.00020484168451239968,
+      "loss": 20.023,
+      "step": 169
+    },
+    {
+      "epoch": 0.21046115753636646,
+      "grad_norm": 3.7461116313934326,
+      "learning_rate": 0.0002037959967879518,
+      "loss": 19.1615,
+      "step": 170
+    },
+    {
+      "epoch": 0.2116991643454039,
+      "grad_norm": 4.025320053100586,
+      "learning_rate": 0.00020274776562737397,
+      "loss": 20.734,
+      "step": 171
+    },
+    {
+      "epoch": 0.21293717115444136,
+      "grad_norm": 4.387618541717529,
+      "learning_rate": 0.00020169705975190628,
+      "loss": 20.0918,
+      "step": 172
+    },
+    {
+      "epoch": 0.2141751779634788,
+      "grad_norm": 4.41184139251709,
+      "learning_rate": 0.00020064394804502903,
+      "loss": 19.2698,
+      "step": 173
+    },
+    {
+      "epoch": 0.21541318477251625,
+      "grad_norm": 4.930720329284668,
+      "learning_rate": 0.0001995884995479472,
+      "loss": 21.0811,
+      "step": 174
+    },
+    {
+      "epoch": 0.2166511915815537,
+      "grad_norm": 4.822671890258789,
+      "learning_rate": 0.000198530783455064,
+      "loss": 19.6059,
+      "step": 175
+    },
+    {
+      "epoch": 0.2166511915815537,
+      "eval_loss": 2.180332660675049,
+      "eval_runtime": 0.4585,
+      "eval_samples_per_second": 109.043,
+      "eval_steps_per_second": 15.266,
+      "step": 175
+    },
+    {
+      "epoch": 0.21788919839059115,
+      "grad_norm": 5.071719169616699,
+      "learning_rate": 0.00019747086910944423,
+      "loss": 20.1044,
+      "step": 176
+    },
+    {
+      "epoch": 0.2191272051996286,
+      "grad_norm": 5.194329261779785,
+      "learning_rate": 0.0001964088259982687,
+      "loss": 19.0575,
+      "step": 177
+    },
+    {
+      "epoch": 0.22036521200866604,
+      "grad_norm": 5.622753620147705,
+      "learning_rate": 0.00019534472374827845,
+      "loss": 19.2182,
+      "step": 178
+    },
+    {
+      "epoch": 0.2216032188177035,
+      "grad_norm": 6.169511318206787,
+      "learning_rate": 0.00019427863212121004,
+      "loss": 19.3218,
+      "step": 179
+    },
+    {
+      "epoch": 0.22284122562674094,
+      "grad_norm": 6.637182712554932,
+      "learning_rate": 0.00019321062100922213,
+      "loss": 18.3712,
+      "step": 180
+    },
+    {
+      "epoch": 0.2240792324357784,
+      "grad_norm": 8.203030586242676,
+      "learning_rate": 0.0001921407604303133,
+      "loss": 18.8138,
+      "step": 181
+    },
+    {
+      "epoch": 0.22531723924481584,
+      "grad_norm": 7.009521961212158,
+      "learning_rate": 0.00019106912052373187,
+      "loss": 18.5025,
+      "step": 182
+    },
+    {
+      "epoch": 0.2265552460538533,
+      "grad_norm": 8.01633358001709,
+      "learning_rate": 0.00018999577154537743,
+      "loss": 19.9148,
+      "step": 183
+    },
+    {
+      "epoch": 0.22779325286289073,
+      "grad_norm": 7.781541347503662,
+      "learning_rate": 0.00018892078386319508,
+      "loss": 19.228,
+      "step": 184
+    },
+    {
+      "epoch": 0.2290312596719282,
+      "grad_norm": 8.206415176391602,
+      "learning_rate": 0.00018784422795256207,
+      "loss": 18.9477,
+      "step": 185
+    },
+    {
+      "epoch": 0.23026926648096566,
+      "grad_norm": 8.606698036193848,
+      "learning_rate": 0.00018676617439166755,
+      "loss": 18.1439,
+      "step": 186
+    },
+    {
+      "epoch": 0.2315072732900031,
+      "grad_norm": 9.17394733428955,
+      "learning_rate": 0.0001856866938568855,
+      "loss": 19.071,
+      "step": 187
+    },
+    {
+      "epoch": 0.23274528009904055,
+      "grad_norm": 9.171770095825195,
+      "learning_rate": 0.0001846058571181412,
+      "loss": 18.9195,
+      "step": 188
+    },
+    {
+      "epoch": 0.233983286908078,
+      "grad_norm": 9.193241119384766,
+      "learning_rate": 0.00018352373503427156,
+      "loss": 18.7589,
+      "step": 189
+    },
+    {
+      "epoch": 0.23522129371711545,
+      "grad_norm": 9.655874252319336,
+      "learning_rate": 0.00018244039854837984,
+      "loss": 20.2001,
+      "step": 190
+    },
+    {
+      "epoch": 0.23645930052615288,
+      "grad_norm": 10.24439525604248,
+      "learning_rate": 0.00018135591868318458,
+      "loss": 17.9239,
+      "step": 191
+    },
+    {
+      "epoch": 0.23769730733519034,
+      "grad_norm": 10.944853782653809,
+      "learning_rate": 0.0001802703665363634,
+      "loss": 16.8519,
+      "step": 192
+    },
+    {
+      "epoch": 0.2389353141442278,
+      "grad_norm": 12.930046081542969,
+      "learning_rate": 0.00017918381327589172,
+      "loss": 22.0313,
+      "step": 193
+    },
+    {
+      "epoch": 0.24017332095326524,
+      "grad_norm": 12.4637451171875,
+      "learning_rate": 0.0001780963301353775,
+      "loss": 19.3518,
+      "step": 194
+    },
+    {
+      "epoch": 0.2414113277623027,
+      "grad_norm": 14.22148609161377,
+      "learning_rate": 0.00017700798840939068,
+      "loss": 20.3325,
+      "step": 195
+    },
+    {
+      "epoch": 0.24264933457134014,
+      "grad_norm": 14.300663948059082,
+      "learning_rate": 0.0001759188594487896,
+      "loss": 22.5563,
+      "step": 196
+    },
+    {
+      "epoch": 0.2438873413803776,
+      "grad_norm": 17.390811920166016,
+      "learning_rate": 0.0001748290146560429,
+      "loss": 22.2467,
+      "step": 197
+    },
+    {
+      "epoch": 0.24512534818941503,
+      "grad_norm": 19.233295440673828,
+      "learning_rate": 0.00017373852548054883,
+      "loss": 23.0156,
+      "step": 198
+    },
+    {
+      "epoch": 0.2463633549984525,
+      "grad_norm": 24.415042877197266,
+      "learning_rate": 0.00017264746341395077,
+      "loss": 25.5748,
+      "step": 199
+    },
+    {
+      "epoch": 0.24760136180748993,
+      "grad_norm": 46.97291946411133,
+      "learning_rate": 0.0001715558999854505,
+      "loss": 30.3984,
+      "step": 200
+    },
+    {
+      "epoch": 0.24760136180748993,
+      "eval_loss": 2.212056875228882,
+      "eval_runtime": 0.4597,
+      "eval_samples_per_second": 108.771,
+      "eval_steps_per_second": 15.228,
+      "step": 200
+    },
+    {
+      "epoch": 0.2488393686165274,
+      "grad_norm": 5.79608678817749,
+      "learning_rate": 0.00017046390675711862,
+      "loss": 18.3087,
+      "step": 201
+    },
+    {
+      "epoch": 0.25007737542556485,
+      "grad_norm": 6.862473964691162,
+      "learning_rate": 0.00016937155531920306,
+      "loss": 19.5213,
+      "step": 202
+    },
+    {
+      "epoch": 0.2513153822346023,
+      "grad_norm": 7.139790058135986,
+      "learning_rate": 0.00016827891728543576,
+      "loss": 19.5547,
+      "step": 203
+    },
+    {
+      "epoch": 0.2525533890436397,
+      "grad_norm": 6.667609691619873,
+      "learning_rate": 0.00016718606428833772,
+      "loss": 18.5032,
+      "step": 204
+    },
+    {
+      "epoch": 0.2537913958526772,
+      "grad_norm": 6.459772109985352,
+      "learning_rate": 0.0001660930679745226,
+      "loss": 19.4291,
+      "step": 205
+    },
+    {
+      "epoch": 0.25502940266171464,
+      "grad_norm": 5.822205543518066,
+      "learning_rate": 0.000165,
+      "loss": 19.1632,
+      "step": 206
+    },
+    {
+      "epoch": 0.2562674094707521,
+      "grad_norm": 4.656381130218506,
+      "learning_rate": 0.0001639069320254774,
+      "loss": 19.6671,
+      "step": 207
+    },
+    {
+      "epoch": 0.2575054162797895,
+      "grad_norm": 4.40847635269165,
+      "learning_rate": 0.00016281393571166228,
+      "loss": 19.0738,
+      "step": 208
+    },
+    {
+      "epoch": 0.258743423088827,
+      "grad_norm": 3.990798234939575,
+      "learning_rate": 0.00016172108271456423,
+      "loss": 18.6036,
+      "step": 209
+    },
+    {
+      "epoch": 0.25998142989786444,
+      "grad_norm": 4.128664970397949,
+      "learning_rate": 0.000160628444680797,
+      "loss": 19.9571,
+      "step": 210
+    },
+    {
+      "epoch": 0.2612194367069019,
+      "grad_norm": 3.9195494651794434,
+      "learning_rate": 0.00015953609324288143,
+      "loss": 19.4763,
+      "step": 211
+    },
+    {
+      "epoch": 0.26245744351593936,
+      "grad_norm": 3.8015336990356445,
+      "learning_rate": 0.00015844410001454953,
+      "loss": 18.8857,
+      "step": 212
+    },
+    {
+      "epoch": 0.26369545032497677,
+      "grad_norm": 3.763514518737793,
+      "learning_rate": 0.00015735253658604917,
+      "loss": 20.0241,
+      "step": 213
+    },
+    {
+      "epoch": 0.2649334571340142,
+      "grad_norm": 3.5288500785827637,
+      "learning_rate": 0.00015626147451945117,
+      "loss": 19.3841,
+      "step": 214
+    },
+    {
+      "epoch": 0.2661714639430517,
+      "grad_norm": 3.7764415740966797,
+      "learning_rate": 0.0001551709853439571,
+      "loss": 20.7493,
+      "step": 215
+    },
+    {
+      "epoch": 0.26740947075208915,
+      "grad_norm": 3.6105055809020996,
+      "learning_rate": 0.00015408114055121046,
+      "loss": 19.8769,
+      "step": 216
+    },
+    {
+      "epoch": 0.2686474775611266,
+      "grad_norm": 3.764777898788452,
+      "learning_rate": 0.00015299201159060932,
+      "loss": 19.1438,
+      "step": 217
+    },
+    {
+      "epoch": 0.269885484370164,
+      "grad_norm": 3.5672457218170166,
+      "learning_rate": 0.0001519036698646225,
+      "loss": 19.7251,
+      "step": 218
+    },
+    {
+      "epoch": 0.2711234911792015,
+      "grad_norm": 3.7739570140838623,
+      "learning_rate": 0.00015081618672410828,
+      "loss": 19.2045,
+      "step": 219
+    },
+    {
+      "epoch": 0.27236149798823894,
+      "grad_norm": 4.052594184875488,
+      "learning_rate": 0.0001497296334636366,
+      "loss": 20.0188,
+      "step": 220
+    },
+    {
+      "epoch": 0.2735995047972764,
+      "grad_norm": 4.5943498611450195,
+      "learning_rate": 0.00014864408131681542,
+      "loss": 19.5231,
+      "step": 221
+    },
+    {
+      "epoch": 0.2748375116063138,
+      "grad_norm": 4.334908485412598,
+      "learning_rate": 0.00014755960145162016,
+      "loss": 19.7947,
+      "step": 222
+    },
+    {
+      "epoch": 0.2760755184153513,
+      "grad_norm": 4.427191734313965,
+      "learning_rate": 0.00014647626496572846,
+      "loss": 20.0526,
+      "step": 223
+    },
+    {
+      "epoch": 0.27731352522438873,
+      "grad_norm": 4.512222766876221,
+      "learning_rate": 0.00014539414288185882,
+      "loss": 19.0376,
+      "step": 224
+    },
+    {
+      "epoch": 0.2785515320334262,
+      "grad_norm": 4.728100299835205,
+      "learning_rate": 0.00014431330614311447,
+      "loss": 20.3587,
+      "step": 225
+    },
+    {
+      "epoch": 0.2785515320334262,
+      "eval_loss": 2.1677565574645996,
+      "eval_runtime": 0.4594,
+      "eval_samples_per_second": 108.838,
+      "eval_steps_per_second": 15.237,
+      "step": 225
+    },
+    {
+      "epoch": 0.27978953884246366,
+      "grad_norm": 5.296298980712891,
+      "learning_rate": 0.00014323382560833242,
+      "loss": 19.9326,
+      "step": 226
+    },
+    {
+      "epoch": 0.28102754565150107,
+      "grad_norm": 5.252712249755859,
+      "learning_rate": 0.00014215577204743793,
+      "loss": 19.731,
+      "step": 227
+    },
+    {
+      "epoch": 0.2822655524605385,
+      "grad_norm": 6.090585231781006,
+      "learning_rate": 0.00014107921613680491,
+      "loss": 18.9676,
+      "step": 228
+    },
+    {
+      "epoch": 0.283503559269576,
+      "grad_norm": 6.460826396942139,
+      "learning_rate": 0.00014000422845462257,
+      "loss": 19.2938,
+      "step": 229
+    },
+    {
+      "epoch": 0.28474156607861345,
+      "grad_norm": 6.747186183929443,
+      "learning_rate": 0.00013893087947626812,
+      "loss": 18.653,
+      "step": 230
+    },
+    {
+      "epoch": 0.28597957288765086,
+      "grad_norm": 6.91395378112793,
+      "learning_rate": 0.00013785923956968668,
+      "loss": 18.0622,
+      "step": 231
+    },
+    {
+      "epoch": 0.2872175796966883,
+      "grad_norm": 9.03824234008789,
+      "learning_rate": 0.00013678937899077787,
+      "loss": 18.5236,
+      "step": 232
+    },
+    {
+      "epoch": 0.2884555865057258,
+      "grad_norm": 8.007859230041504,
+      "learning_rate": 0.00013572136787878995,
+      "loss": 18.304,
+      "step": 233
+    },
+    {
+      "epoch": 0.28969359331476324,
+      "grad_norm": 7.938798427581787,
+      "learning_rate": 0.00013465527625172158,
+      "loss": 16.7488,
+      "step": 234
+    },
+    {
+      "epoch": 0.2909316001238007,
+      "grad_norm": 8.674590110778809,
+      "learning_rate": 0.0001335911740017313,
+      "loss": 18.2731,
+      "step": 235
+    },
+    {
+      "epoch": 0.2921696069328381,
+      "grad_norm": 8.993042945861816,
+      "learning_rate": 0.0001325291308905558,
+      "loss": 20.209,
+      "step": 236
+    },
+    {
+      "epoch": 0.2934076137418756,
+      "grad_norm": 10.559680938720703,
+      "learning_rate": 0.00013146921654493598,
+      "loss": 19.7907,
+      "step": 237
+    },
+    {
+      "epoch": 0.29464562055091303,
+      "grad_norm": 9.593064308166504,
+      "learning_rate": 0.00013041150045205272,
+      "loss": 18.3169,
+      "step": 238
+    },
+    {
+      "epoch": 0.2958836273599505,
+      "grad_norm": 11.276885986328125,
+      "learning_rate": 0.00012935605195497094,
+      "loss": 20.2917,
+      "step": 239
+    },
+    {
+      "epoch": 0.2971216341689879,
+      "grad_norm": 11.100722312927246,
+      "learning_rate": 0.00012830294024809372,
+      "loss": 18.25,
+      "step": 240
+    },
+    {
+      "epoch": 0.29835964097802536,
+      "grad_norm": 11.526241302490234,
+      "learning_rate": 0.000127252234372626,
+      "loss": 19.8701,
+      "step": 241
+    },
+    {
+      "epoch": 0.2995976477870628,
+      "grad_norm": 10.755220413208008,
+      "learning_rate": 0.0001262040032120482,
+      "loss": 16.2828,
+      "step": 242
+    },
+    {
+      "epoch": 0.3008356545961003,
+      "grad_norm": 13.15343189239502,
+      "learning_rate": 0.00012515831548760031,
+      "loss": 18.6229,
+      "step": 243
+    },
+    {
+      "epoch": 0.30207366140513775,
+      "grad_norm": 12.657286643981934,
+      "learning_rate": 0.00012411523975377667,
+      "loss": 19.268,
+      "step": 244
+    },
+    {
+      "epoch": 0.30331166821417516,
+      "grad_norm": 13.56824016571045,
+      "learning_rate": 0.0001230748443938313,
+      "loss": 19.3271,
+      "step": 245
+    },
+    {
+      "epoch": 0.3045496750232126,
+      "grad_norm": 14.00251579284668,
+      "learning_rate": 0.00012203719761529511,
+      "loss": 22.5264,
+      "step": 246
+    },
+    {
+      "epoch": 0.3057876818322501,
+      "grad_norm": 15.027515411376953,
+      "learning_rate": 0.00012100236744550403,
+      "loss": 18.8906,
+      "step": 247
+    },
+    {
+      "epoch": 0.30702568864128754,
+      "grad_norm": 18.13871192932129,
+      "learning_rate": 0.00011997042172713925,
+      "loss": 23.2494,
+      "step": 248
+    },
+    {
+      "epoch": 0.308263695450325,
+      "grad_norm": 21.671537399291992,
+      "learning_rate": 0.00011894142811377952,
+      "loss": 24.1794,
+      "step": 249
+    },
+    {
+      "epoch": 0.3095017022593624,
+      "grad_norm": 41.55762481689453,
+      "learning_rate": 0.00011791545406546589,
+      "loss": 30.4137,
+      "step": 250
+    },
+    {
+      "epoch": 0.3095017022593624,
+      "eval_loss": 2.245269536972046,
+      "eval_runtime": 0.4583,
+      "eval_samples_per_second": 109.091,
+      "eval_steps_per_second": 15.273,
+      "step": 250
+    },
+    {
+      "epoch": 0.31073970906839987,
+      "grad_norm": 5.071318626403809,
+      "learning_rate": 0.0001168925668442789,
+      "loss": 18.5849,
+      "step": 251
+    },
+    {
+      "epoch": 0.31197771587743733,
+      "grad_norm": 5.750345230102539,
+      "learning_rate": 0.00011587283350992917,
+      "loss": 19.3662,
+      "step": 252
+    },
+    {
+      "epoch": 0.3132157226864748,
+      "grad_norm": 6.307399272918701,
+      "learning_rate": 0.00011485632091536079,
+      "loss": 18.9768,
+      "step": 253
+    },
+    {
+      "epoch": 0.3144537294955122,
+      "grad_norm": 5.8165483474731445,
+      "learning_rate": 0.0001138430957023686,
+      "loss": 18.3872,
+      "step": 254
+    },
+    {
+      "epoch": 0.31569173630454966,
+      "grad_norm": 5.609121799468994,
+      "learning_rate": 0.00011283322429722926,
+      "loss": 18.7267,
+      "step": 255
+    },
+    {
+      "epoch": 0.3169297431135871,
+      "grad_norm": 5.074406623840332,
+      "learning_rate": 0.00011182677290634626,
+      "loss": 19.2954,
+      "step": 256
+    },
+    {
+      "epoch": 0.3181677499226246,
+      "grad_norm": 4.583287239074707,
+      "learning_rate": 0.00011082380751190957,
+      "loss": 18.8282,
+      "step": 257
+    },
+    {
+      "epoch": 0.31940575673166205,
+      "grad_norm": 4.339077949523926,
+      "learning_rate": 0.00010982439386756993,
+      "loss": 19.5454,
+      "step": 258
+    },
+    {
+      "epoch": 0.32064376354069946,
+      "grad_norm": 3.788411855697632,
+      "learning_rate": 0.000108828597494128,
+      "loss": 19.3046,
+      "step": 259
+    },
+    {
+      "epoch": 0.3218817703497369,
+      "grad_norm": 3.411381959915161,
+      "learning_rate": 0.00010783648367523887,
+      "loss": 18.6118,
+      "step": 260
+    },
+    {
+      "epoch": 0.3231197771587744,
+      "grad_norm": 3.3583920001983643,
+      "learning_rate": 0.00010684811745313209,
+      "loss": 18.6945,
+      "step": 261
+    },
+    {
+      "epoch": 0.32435778396781184,
+      "grad_norm": 3.497206926345825,
+      "learning_rate": 0.00010586356362434786,
+      "loss": 18.9166,
+      "step": 262
+    },
+    {
+      "epoch": 0.32559579077684925,
+      "grad_norm": 3.517178535461426,
+      "learning_rate": 0.00010488288673548855,
+      "loss": 19.1632,
+      "step": 263
+    },
+    {
+      "epoch": 0.3268337975858867,
+      "grad_norm": 3.687941074371338,
+      "learning_rate": 0.00010390615107898727,
+      "loss": 19.392,
+      "step": 264
+    },
+    {
+      "epoch": 0.32807180439492417,
+      "grad_norm": 3.892406463623047,
+      "learning_rate": 0.00010293342068889302,
+      "loss": 19.071,
+      "step": 265
+    },
+    {
+      "epoch": 0.32930981120396163,
+      "grad_norm": 3.858397960662842,
+      "learning_rate": 0.00010196475933667252,
+      "loss": 19.042,
+      "step": 266
+    },
+    {
+      "epoch": 0.3305478180129991,
+      "grad_norm": 3.9651567935943604,
+      "learning_rate": 0.00010100023052702953,
+      "loss": 19.4187,
+      "step": 267
+    },
+    {
+      "epoch": 0.3317858248220365,
+      "grad_norm": 3.9167113304138184,
+      "learning_rate": 0.00010003989749374132,
+      "loss": 18.5675,
+      "step": 268
+    },
+    {
+      "epoch": 0.33302383163107396,
+      "grad_norm": 3.9492886066436768,
+      "learning_rate": 9.90838231955134e-05,
+      "loss": 20.0447,
+      "step": 269
+    },
+    {
+      "epoch": 0.3342618384401114,
+      "grad_norm": 4.153578281402588,
+      "learning_rate": 9.813207031185173e-05,
+      "loss": 19.7747,
+      "step": 270
+    },
+    {
+      "epoch": 0.3354998452491489,
+      "grad_norm": 4.268838405609131,
+      "learning_rate": 9.718470123895357e-05,
+      "loss": 19.0648,
+      "step": 271
+    },
+    {
+      "epoch": 0.3367378520581863,
+      "grad_norm": 4.0107951164245605,
+      "learning_rate": 9.624177808561703e-05,
+      "loss": 19.7128,
+      "step": 272
+    },
+    {
+      "epoch": 0.33797585886722376,
+      "grad_norm": 4.586495876312256,
+      "learning_rate": 9.53033626691689e-05,
+      "loss": 19.7837,
+      "step": 273
+    },
+    {
+      "epoch": 0.3392138656762612,
+      "grad_norm": 4.765363693237305,
+      "learning_rate": 9.436951651141242e-05,
+      "loss": 19.4179,
+      "step": 274
+    },
+    {
+      "epoch": 0.3404518724852987,
+      "grad_norm": 4.7433648109436035,
+      "learning_rate": 9.34403008345935e-05,
+      "loss": 20.4525,
+      "step": 275
+    },
+    {
+      "epoch": 0.3404518724852987,
+      "eval_loss": 2.149083137512207,
+      "eval_runtime": 0.4596,
+      "eval_samples_per_second": 108.779,
+      "eval_steps_per_second": 15.229,
+      "step": 275
+    },
+    {
+      "epoch": 0.34168987929433614,
+      "grad_norm": 5.287757873535156,
+      "learning_rate": 9.251577655738719e-05,
+      "loss": 19.0087,
+      "step": 276
+    },
+    {
+      "epoch": 0.34292788610337355,
+      "grad_norm": 5.111241817474365,
+      "learning_rate": 9.159600429090411e-05,
+      "loss": 20.1982,
+      "step": 277
+    },
+    {
+      "epoch": 0.344165892912411,
+      "grad_norm": 5.750723838806152,
+      "learning_rate": 9.068104433471652e-05,
+      "loss": 19.1619,
+      "step": 278
+    },
+    {
+      "epoch": 0.34540389972144847,
+      "grad_norm": 5.881593704223633,
+      "learning_rate": 8.97709566729054e-05,
+      "loss": 18.5968,
+      "step": 279
+    },
+    {
+      "epoch": 0.34664190653048593,
+      "grad_norm": 6.350897789001465,
+      "learning_rate": 8.886580097012762e-05,
+      "loss": 18.4535,
+      "step": 280
+    },
+    {
+      "epoch": 0.3478799133395234,
+      "grad_norm": 7.2103095054626465,
+      "learning_rate": 8.796563656770475e-05,
+      "loss": 18.0141,
+      "step": 281
+    },
+    {
+      "epoch": 0.3491179201485608,
+      "grad_norm": 8.218284606933594,
+      "learning_rate": 8.70705224797324e-05,
+      "loss": 19.1382,
+      "step": 282
+    },
+    {
+      "epoch": 0.35035592695759826,
+      "grad_norm": 8.265275955200195,
+      "learning_rate": 8.618051738921134e-05,
+      "loss": 18.2249,
+      "step": 283
+    },
+    {
+      "epoch": 0.3515939337666357,
+      "grad_norm": 8.471662521362305,
+      "learning_rate": 8.529567964420047e-05,
+      "loss": 18.3772,
+      "step": 284
+    },
+    {
+      "epoch": 0.3528319405756732,
+      "grad_norm": 9.499984741210938,
+      "learning_rate": 8.44160672539913e-05,
+      "loss": 19.3033,
+      "step": 285
+    },
+    {
+      "epoch": 0.3540699473847106,
+      "grad_norm": 9.403618812561035,
+      "learning_rate": 8.354173788530516e-05,
+      "loss": 18.1899,
+      "step": 286
+    },
+    {
+      "epoch": 0.35530795419374805,
+      "grad_norm": 9.960768699645996,
+      "learning_rate": 8.267274885851248e-05,
+      "loss": 17.1351,
+      "step": 287
+    },
+    {
+      "epoch": 0.3565459610027855,
+      "grad_norm": 10.338618278503418,
+      "learning_rate": 8.180915714387479e-05,
+      "loss": 19.4274,
+      "step": 288
+    },
+    {
+      "epoch": 0.357783967811823,
+      "grad_norm": 13.337356567382812,
+      "learning_rate": 8.095101935781016e-05,
+      "loss": 21.5455,
+      "step": 289
+    },
+    {
+      "epoch": 0.35902197462086044,
+      "grad_norm": 11.105886459350586,
+      "learning_rate": 8.009839175918098e-05,
+      "loss": 19.1035,
+      "step": 290
+    },
+    {
+      "epoch": 0.36025998142989785,
+      "grad_norm": 10.98268985748291,
+      "learning_rate": 7.925133024560616e-05,
+      "loss": 19.5065,
+      "step": 291
+    },
+    {
+      "epoch": 0.3614979882389353,
+      "grad_norm": 13.637511253356934,
+      "learning_rate": 7.840989034979613e-05,
+      "loss": 20.3372,
+      "step": 292
+    },
+    {
+      "epoch": 0.36273599504797277,
+      "grad_norm": 11.745948791503906,
+      "learning_rate": 7.75741272359124e-05,
+      "loss": 18.4544,
+      "step": 293
+    },
+    {
+      "epoch": 0.36397400185701023,
+      "grad_norm": 12.553361892700195,
+      "learning_rate": 7.674409569595094e-05,
+      "loss": 19.3569,
+      "step": 294
+    },
+    {
+      "epoch": 0.36521200866604764,
+      "grad_norm": 14.240659713745117,
+      "learning_rate": 7.591985014615007e-05,
+      "loss": 19.9098,
+      "step": 295
+    },
+    {
+      "epoch": 0.3664500154750851,
+      "grad_norm": 13.349631309509277,
+      "learning_rate": 7.510144462342307e-05,
+      "loss": 18.9005,
+      "step": 296
+    },
+    {
+      "epoch": 0.36768802228412256,
+      "grad_norm": 17.52948570251465,
+      "learning_rate": 7.428893278181532e-05,
+      "loss": 22.9618,
+      "step": 297
+    },
+    {
+      "epoch": 0.36892602909316,
+      "grad_norm": 19.178434371948242,
+      "learning_rate": 7.348236788898705e-05,
+      "loss": 24.8635,
+      "step": 298
+    },
+    {
+      "epoch": 0.3701640359021975,
+      "grad_norm": 24.319643020629883,
+      "learning_rate": 7.268180282272107e-05,
+      "loss": 26.7912,
+      "step": 299
+    },
+    {
+      "epoch": 0.3714020427112349,
+      "grad_norm": 41.898406982421875,
+      "learning_rate": 7.188729006745592e-05,
+      "loss": 29.6048,
+      "step": 300
+    },
+    {
+      "epoch": 0.3714020427112349,
+      "eval_loss": 2.1738827228546143,
+      "eval_runtime": 0.4588,
+      "eval_samples_per_second": 108.983,
+      "eval_steps_per_second": 15.258,
+      "step": 300
+    },
+    {
+      "epoch": 0.37264004952027235,
+      "grad_norm": 3.581990957260132,
+      "learning_rate": 7.109888171084546e-05,
+      "loss": 18.1553,
+      "step": 301
+    },
+    {
+      "epoch": 0.3738780563293098,
+      "grad_norm": 3.915548801422119,
+      "learning_rate": 7.03166294403436e-05,
+      "loss": 18.6218,
+      "step": 302
+    },
+    {
+      "epoch": 0.3751160631383473,
+      "grad_norm": 4.525240898132324,
+      "learning_rate": 6.954058453981609e-05,
+      "loss": 19.0451,
+      "step": 303
+    },
+    {
+      "epoch": 0.3763540699473847,
+      "grad_norm": 4.901094436645508,
+      "learning_rate": 6.877079788617809e-05,
+      "loss": 18.4888,
+      "step": 304
+    },
+    {
+      "epoch": 0.37759207675642215,
+      "grad_norm": 4.8025360107421875,
+      "learning_rate": 6.80073199460588e-05,
+      "loss": 18.6165,
+      "step": 305
+    },
+    {
+      "epoch": 0.3788300835654596,
+      "grad_norm": 4.818271160125732,
+      "learning_rate": 6.725020077249312e-05,
+      "loss": 18.7518,
+      "step": 306
+    },
+    {
+      "epoch": 0.38006809037449707,
+      "grad_norm": 4.015491485595703,
+      "learning_rate": 6.649949000163985e-05,
+      "loss": 19.3923,
+      "step": 307
+    },
+    {
+      "epoch": 0.38130609718353453,
+      "grad_norm": 4.116284370422363,
+      "learning_rate": 6.575523684952798e-05,
+      "loss": 19.1145,
+      "step": 308
+    },
+    {
+      "epoch": 0.38254410399257194,
+      "grad_norm": 3.7605159282684326,
+      "learning_rate": 6.501749010882962e-05,
+      "loss": 19.3626,
+      "step": 309
+    },
+    {
+      "epoch": 0.3837821108016094,
+      "grad_norm": 3.646667718887329,
+      "learning_rate": 6.428629814566178e-05,
+      "loss": 18.8842,
+      "step": 310
+    },
+    {
+      "epoch": 0.38502011761064686,
+      "grad_norm": 3.4646224975585938,
+      "learning_rate": 6.356170889641505e-05,
+      "loss": 18.0124,
+      "step": 311
+    },
+    {
+      "epoch": 0.3862581244196843,
+      "grad_norm": 3.335514545440674,
+      "learning_rate": 6.284376986461113e-05,
+      "loss": 19.1248,
+      "step": 312
+    },
+    {
+      "epoch": 0.3874961312287218,
+      "grad_norm": 3.5872344970703125,
+      "learning_rate": 6.213252811778856e-05,
+      "loss": 19.1753,
+      "step": 313
+    },
+    {
+      "epoch": 0.3887341380377592,
+      "grad_norm": 3.3530263900756836,
+      "learning_rate": 6.142803028441687e-05,
+      "loss": 18.9668,
+      "step": 314
+    },
+    {
+      "epoch": 0.38997214484679665,
+      "grad_norm": 3.4116427898406982,
+      "learning_rate": 6.073032255083983e-05,
+      "loss": 19.3431,
+      "step": 315
+    },
+    {
+      "epoch": 0.3912101516558341,
+      "grad_norm": 3.7365005016326904,
+      "learning_rate": 6.003945065824737e-05,
+      "loss": 18.7074,
+      "step": 316
+    },
+    {
+      "epoch": 0.3924481584648716,
+      "grad_norm": 3.765794515609741,
+      "learning_rate": 5.935545989967681e-05,
+      "loss": 19.5816,
+      "step": 317
+    },
+    {
+      "epoch": 0.393686165273909,
+      "grad_norm": 4.13867712020874,
+      "learning_rate": 5.867839511704368e-05,
+      "loss": 19.3254,
+      "step": 318
+    },
+    {
+      "epoch": 0.39492417208294645,
+      "grad_norm": 4.177253723144531,
+      "learning_rate": 5.800830069820163e-05,
+      "loss": 20.1147,
+      "step": 319
+    },
+    {
+      "epoch": 0.3961621788919839,
+      "grad_norm": 4.264730930328369,
+      "learning_rate": 5.734522057403271e-05,
+      "loss": 19.2077,
+      "step": 320
+    },
+    {
+      "epoch": 0.39740018570102137,
+      "grad_norm": 4.475826263427734,
+      "learning_rate": 5.668919821556695e-05,
+      "loss": 19.8458,
+      "step": 321
+    },
+    {
+      "epoch": 0.39863819251005883,
+      "grad_norm": 4.821839809417725,
+      "learning_rate": 5.604027663113273e-05,
+      "loss": 19.9184,
+      "step": 322
+    },
+    {
+      "epoch": 0.39987619931909624,
+      "grad_norm": 5.058393478393555,
+      "learning_rate": 5.539849836353708e-05,
+      "loss": 19.6655,
+      "step": 323
+    },
+    {
+      "epoch": 0.4011142061281337,
+      "grad_norm": 5.279305458068848,
+      "learning_rate": 5.4763905487276506e-05,
+      "loss": 19.4492,
+      "step": 324
+    },
+    {
+      "epoch": 0.40235221293717116,
+      "grad_norm": 5.956630229949951,
+      "learning_rate": 5.413653960577879e-05,
+      "loss": 18.8365,
+      "step": 325
+    },
+    {
+      "epoch": 0.40235221293717116,
+      "eval_loss": 2.1857447624206543,
+      "eval_runtime": 0.4564,
+      "eval_samples_per_second": 109.564,
+      "eval_steps_per_second": 15.339,
+      "step": 325
+    },
+    {
+      "epoch": 0.4035902197462086,
+      "grad_norm": 6.074418067932129,
+      "learning_rate": 5.3516441848675385e-05,
+      "loss": 18.2903,
+      "step": 326
+    },
+    {
+      "epoch": 0.40482822655524603,
+      "grad_norm": 6.494494438171387,
+      "learning_rate": 5.290365286910496e-05,
+      "loss": 19.7135,
+      "step": 327
+    },
+    {
+      "epoch": 0.4060662333642835,
+      "grad_norm": 6.426371097564697,
+      "learning_rate": 5.229821284104842e-05,
+      "loss": 19.7109,
+      "step": 328
+    },
+    {
+      "epoch": 0.40730424017332095,
+      "grad_norm": 7.912612438201904,
+      "learning_rate": 5.1700161456694785e-05,
+      "loss": 17.9351,
+      "step": 329
+    },
+    {
+      "epoch": 0.4085422469823584,
+      "grad_norm": 7.362687110900879,
+      "learning_rate": 5.110953792383941e-05,
+      "loss": 18.4544,
+      "step": 330
+    },
+    {
+      "epoch": 0.4097802537913959,
+      "grad_norm": 7.055054187774658,
+      "learning_rate": 5.0526380963313146e-05,
+      "loss": 18.3793,
+      "step": 331
+    },
+    {
+      "epoch": 0.4110182606004333,
+      "grad_norm": 8.435379981994629,
+      "learning_rate": 4.995072880644416e-05,
+      "loss": 19.1846,
+      "step": 332
+    },
+    {
+      "epoch": 0.41225626740947074,
+      "grad_norm": 7.945043563842773,
+      "learning_rate": 4.938261919255135e-05,
+      "loss": 18.6436,
+      "step": 333
+    },
+    {
+      "epoch": 0.4134942742185082,
+      "grad_norm": 9.138226509094238,
+      "learning_rate": 4.882208936647008e-05,
+      "loss": 18.4547,
+      "step": 334
+    },
+    {
+      "epoch": 0.41473228102754567,
+      "grad_norm": 8.763908386230469,
+      "learning_rate": 4.826917607611085e-05,
+      "loss": 17.9891,
+      "step": 335
+    },
+    {
+      "epoch": 0.4159702878365831,
+      "grad_norm": 9.440610885620117,
+      "learning_rate": 4.7723915570049596e-05,
+      "loss": 19.0053,
+      "step": 336
+    },
+    {
+      "epoch": 0.41720829464562054,
+      "grad_norm": 9.44798755645752,
+      "learning_rate": 4.718634359515167e-05,
+      "loss": 17.9599,
+      "step": 337
+    },
+    {
+      "epoch": 0.418446301454658,
+      "grad_norm": 9.94621753692627,
+      "learning_rate": 4.6656495394228076e-05,
+      "loss": 19.4213,
+      "step": 338
+    },
+    {
+      "epoch": 0.41968430826369546,
+      "grad_norm": 10.124737739562988,
+      "learning_rate": 4.613440570372504e-05,
+      "loss": 19.1964,
+      "step": 339
+    },
+    {
+      "epoch": 0.4209223150727329,
+      "grad_norm": 12.60027027130127,
+      "learning_rate": 4.562010875144683e-05,
+      "loss": 20.4584,
+      "step": 340
+    },
+    {
+      "epoch": 0.42216032188177033,
+      "grad_norm": 13.383706092834473,
+      "learning_rate": 4.511363825431157e-05,
+      "loss": 18.0053,
+      "step": 341
+    },
+    {
+      "epoch": 0.4233983286908078,
+      "grad_norm": 16.01007652282715,
+      "learning_rate": 4.461502741614107e-05,
+      "loss": 18.4217,
+      "step": 342
+    },
+    {
+      "epoch": 0.42463633549984525,
+      "grad_norm": 13.850798606872559,
+      "learning_rate": 4.4124308925483684e-05,
+      "loss": 18.0339,
+      "step": 343
+    },
+    {
+      "epoch": 0.4258743423088827,
+      "grad_norm": 12.687660217285156,
+      "learning_rate": 4.364151495347164e-05,
+      "loss": 19.0845,
+      "step": 344
+    },
+    {
+      "epoch": 0.4271123491179202,
+      "grad_norm": 14.462074279785156,
+      "learning_rate": 4.316667715171158e-05,
+      "loss": 21.136,
+      "step": 345
+    },
+    {
+      "epoch": 0.4283503559269576,
+      "grad_norm": 16.886581420898438,
+      "learning_rate": 4.269982665020967e-05,
+      "loss": 20.5538,
+      "step": 346
+    },
+    {
+      "epoch": 0.42958836273599504,
+      "grad_norm": 16.225481033325195,
+      "learning_rate": 4.2240994055330856e-05,
+      "loss": 19.9578,
+      "step": 347
+    },
+    {
+      "epoch": 0.4308263695450325,
+      "grad_norm": 20.5854434967041,
+      "learning_rate": 4.179020944779209e-05,
+      "loss": 21.8638,
+      "step": 348
+    },
+    {
+      "epoch": 0.43206437635406997,
+      "grad_norm": 24.899171829223633,
+      "learning_rate": 4.1347502380690375e-05,
+      "loss": 22.8508,
+      "step": 349
+    },
+    {
+      "epoch": 0.4333023831631074,
+      "grad_norm": 46.690704345703125,
+      "learning_rate": 4.091290187756536e-05,
+      "loss": 29.4439,
+      "step": 350
+    },
+    {
+      "epoch": 0.4333023831631074,
+      "eval_loss": 2.203415632247925,
+      "eval_runtime": 0.4597,
+      "eval_samples_per_second": 108.759,
+      "eval_steps_per_second": 15.226,
+      "step": 350
+    },
+    {
+      "epoch": 0.43454038997214484,
+      "grad_norm": 2.9752297401428223,
+      "learning_rate": 4.048643643049642e-05,
+      "loss": 18.6902,
+      "step": 351
+    },
+    {
+      "epoch": 0.4357783967811823,
+      "grad_norm": 3.8073084354400635,
+      "learning_rate": 4.006813399823494e-05,
+      "loss": 18.6591,
+      "step": 352
+    },
+    {
+      "epoch": 0.43701640359021976,
+      "grad_norm": 3.7948577404022217,
+      "learning_rate": 3.965802200437112e-05,
+      "loss": 18.1709,
+      "step": 353
+    },
+    {
+      "epoch": 0.4382544103992572,
+      "grad_norm": 3.713205337524414,
+      "learning_rate": 3.925612733553629e-05,
+      "loss": 18.4915,
+      "step": 354
+    },
+    {
+      "epoch": 0.43949241720829463,
+      "grad_norm": 3.6605148315429688,
+      "learning_rate": 3.88624763396402e-05,
+      "loss": 18.508,
+      "step": 355
+    },
+    {
+      "epoch": 0.4407304240173321,
+      "grad_norm": 3.8302948474884033,
+      "learning_rate": 3.8477094824143625e-05,
+      "loss": 19.2908,
+      "step": 356
+    },
+    {
+      "epoch": 0.44196843082636955,
+      "grad_norm": 3.5864696502685547,
+      "learning_rate": 3.810000805436647e-05,
+      "loss": 18.2617,
+      "step": 357
+    },
+    {
+      "epoch": 0.443206437635407,
+      "grad_norm": 3.4480526447296143,
+      "learning_rate": 3.7731240751831375e-05,
+      "loss": 18.558,
+      "step": 358
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 3.829530954360962,
+      "learning_rate": 3.7370817092643135e-05,
+      "loss": 18.6464,
+      "step": 359
+    },
+    {
+      "epoch": 0.4456824512534819,
+      "grad_norm": 3.834198236465454,
+      "learning_rate": 3.701876070590349e-05,
+      "loss": 18.5044,
+      "step": 360
+    },
+    {
+      "epoch": 0.44692045806251934,
+      "grad_norm": 3.4071412086486816,
+      "learning_rate": 3.667509467216217e-05,
+      "loss": 18.6708,
+      "step": 361
+    },
+    {
+      "epoch": 0.4481584648715568,
+      "grad_norm": 3.447110652923584,
+      "learning_rate": 3.633984152190378e-05,
+      "loss": 18.3933,
+      "step": 362
+    },
+    {
+      "epoch": 0.44939647168059427,
+      "grad_norm": 3.3696038722991943,
+      "learning_rate": 3.601302323407059e-05,
+      "loss": 19.3038,
+      "step": 363
+    },
+    {
+      "epoch": 0.4506344784896317,
+      "grad_norm": 3.346982717514038,
+      "learning_rate": 3.5694661234621745e-05,
+      "loss": 18.7913,
+      "step": 364
+    },
+    {
+      "epoch": 0.45187248529866914,
+      "grad_norm": 3.4483158588409424,
+      "learning_rate": 3.5384776395128484e-05,
+      "loss": 18.9989,
+      "step": 365
+    },
+    {
+      "epoch": 0.4531104921077066,
+      "grad_norm": 3.633026361465454,
+      "learning_rate": 3.508338903140592e-05,
+      "loss": 18.5967,
+      "step": 366
+    },
+    {
+      "epoch": 0.45434849891674406,
+      "grad_norm": 3.671412467956543,
+      "learning_rate": 3.4790518902181075e-05,
+      "loss": 18.9874,
+      "step": 367
+    },
+    {
+      "epoch": 0.45558650572578147,
+      "grad_norm": 3.686110734939575,
+      "learning_rate": 3.4506185207797495e-05,
+      "loss": 18.8547,
+      "step": 368
+    },
+    {
+      "epoch": 0.4568245125348189,
+      "grad_norm": 3.8215835094451904,
+      "learning_rate": 3.423040658895662e-05,
+      "loss": 19.5879,
+      "step": 369
+    },
+    {
+      "epoch": 0.4580625193438564,
+      "grad_norm": 4.052115440368652,
+      "learning_rate": 3.396320112549551e-05,
+      "loss": 19.4948,
+      "step": 370
+    },
+    {
+      "epoch": 0.45930052615289385,
+      "grad_norm": 4.376103401184082,
+      "learning_rate": 3.3704586335201794e-05,
+      "loss": 19.8034,
+      "step": 371
+    },
+    {
+      "epoch": 0.4605385329619313,
+      "grad_norm": 4.248034954071045,
+      "learning_rate": 3.345457917266499e-05,
+      "loss": 19.3474,
+      "step": 372
+    },
+    {
+      "epoch": 0.4617765397709687,
+      "grad_norm": 4.547921180725098,
+      "learning_rate": 3.321319602816507e-05,
+      "loss": 18.2825,
+      "step": 373
+    },
+    {
+      "epoch": 0.4630145465800062,
+      "grad_norm": 4.821180820465088,
+      "learning_rate": 3.298045272659797e-05,
+      "loss": 19.2673,
+      "step": 374
+    },
+    {
+      "epoch": 0.46425255338904364,
+      "grad_norm": 5.608479022979736,
+      "learning_rate": 3.275636452643802e-05,
+      "loss": 18.2961,
+      "step": 375
+    },
+    {
+      "epoch": 0.46425255338904364,
+      "eval_loss": 2.140087366104126,
+      "eval_runtime": 0.4588,
+      "eval_samples_per_second": 108.974,
+      "eval_steps_per_second": 15.256,
+      "step": 375
+    },
+    {
+      "epoch": 0.4654905601980811,
+      "grad_norm": 5.788666248321533,
+      "learning_rate": 3.254094611873773e-05,
+      "loss": 18.8808,
+      "step": 376
+    },
+    {
+      "epoch": 0.46672856700711857,
+      "grad_norm": 6.207448959350586,
+      "learning_rate": 3.2334211626164515e-05,
+      "loss": 18.5297,
+      "step": 377
+    },
+    {
+      "epoch": 0.467966573816156,
+      "grad_norm": 6.479159832000732,
+      "learning_rate": 3.213617460207498e-05,
+      "loss": 20.1104,
+      "step": 378
+    },
+    {
+      "epoch": 0.46920458062519343,
+      "grad_norm": 7.514153003692627,
+      "learning_rate": 3.1946848029626226e-05,
+      "loss": 18.3279,
+      "step": 379
+    },
+    {
+      "epoch": 0.4704425874342309,
+      "grad_norm": 8.025906562805176,
+      "learning_rate": 3.176624432092475e-05,
+      "loss": 17.8471,
+      "step": 380
+    },
+    {
+      "epoch": 0.47168059424326836,
+      "grad_norm": 8.171121597290039,
+      "learning_rate": 3.159437531621279e-05,
+      "loss": 20.1712,
+      "step": 381
+    },
+    {
+      "epoch": 0.47291860105230576,
+      "grad_norm": 9.144654273986816,
+      "learning_rate": 3.143125228309194e-05,
+      "loss": 17.2353,
+      "step": 382
+    },
+    {
+      "epoch": 0.4741566078613432,
+      "grad_norm": 7.720610618591309,
+      "learning_rate": 3.127688591578455e-05,
+      "loss": 19.4551,
+      "step": 383
+    },
+    {
+      "epoch": 0.4753946146703807,
+      "grad_norm": 10.915635108947754,
+      "learning_rate": 3.113128633443261e-05,
+      "loss": 17.2351,
+      "step": 384
+    },
+    {
+      "epoch": 0.47663262147941815,
+      "grad_norm": 10.034469604492188,
+      "learning_rate": 3.099446308443426e-05,
+      "loss": 18.8228,
+      "step": 385
+    },
+    {
+      "epoch": 0.4778706282884556,
+      "grad_norm": 11.461372375488281,
+      "learning_rate": 3.086642513581802e-05,
+      "loss": 18.355,
+      "step": 386
+    },
+    {
+      "epoch": 0.479108635097493,
+      "grad_norm": 11.581945419311523,
+      "learning_rate": 3.0747180882654696e-05,
+      "loss": 19.4607,
+      "step": 387
+    },
+    {
+      "epoch": 0.4803466419065305,
+      "grad_norm": 10.024002075195312,
+      "learning_rate": 3.0636738142507115e-05,
+      "loss": 18.5541,
+      "step": 388
+    },
+    {
+      "epoch": 0.48158464871556794,
+      "grad_norm": 12.054183006286621,
+      "learning_rate": 3.0535104155917565e-05,
+      "loss": 19.0649,
+      "step": 389
+    },
+    {
+      "epoch": 0.4828226555246054,
+      "grad_norm": 12.183855056762695,
+      "learning_rate": 3.044228558593313e-05,
+      "loss": 19.0591,
+      "step": 390
+    },
+    {
+      "epoch": 0.4840606623336428,
+      "grad_norm": 11.676289558410645,
+      "learning_rate": 3.0358288517668886e-05,
+      "loss": 17.4302,
+      "step": 391
+    },
+    {
+      "epoch": 0.4852986691426803,
+      "grad_norm": 13.100257873535156,
+      "learning_rate": 3.028311845790893e-05,
+      "loss": 18.6739,
+      "step": 392
+    },
+    {
+      "epoch": 0.48653667595171773,
+      "grad_norm": 13.003954887390137,
+      "learning_rate": 3.0216780334745368e-05,
+      "loss": 18.2582,
+      "step": 393
+    },
+    {
+      "epoch": 0.4877746827607552,
+      "grad_norm": 13.394844055175781,
+      "learning_rate": 3.0159278497255256e-05,
+      "loss": 20.41,
+      "step": 394
+    },
+    {
+      "epoch": 0.48901268956979266,
+      "grad_norm": 16.421558380126953,
+      "learning_rate": 3.0110616715215444e-05,
+      "loss": 19.9794,
+      "step": 395
+    },
+    {
+      "epoch": 0.49025069637883006,
+      "grad_norm": 14.601800918579102,
+      "learning_rate": 3.0070798178855484e-05,
+      "loss": 19.4204,
+      "step": 396
+    },
+    {
+      "epoch": 0.4914887031878675,
+      "grad_norm": 19.369102478027344,
+      "learning_rate": 3.0039825498648413e-05,
+      "loss": 20.4144,
+      "step": 397
+    },
+    {
+      "epoch": 0.492726709996905,
+      "grad_norm": 22.31803321838379,
+      "learning_rate": 3.001770070513965e-05,
+      "loss": 22.4627,
+      "step": 398
+    },
+    {
+      "epoch": 0.49396471680594245,
+      "grad_norm": 25.993896484375,
+      "learning_rate": 3.0004425248813897e-05,
+      "loss": 26.4305,
+      "step": 399
+    },
+    {
+      "epoch": 0.49520272361497986,
+      "grad_norm": 46.47043991088867,
+      "learning_rate": 2.9999999999999997e-05,
+      "loss": 26.5223,
+      "step": 400
+    },
+    {
+      "epoch": 0.49520272361497986,
+      "eval_loss": 2.152632236480713,
+      "eval_runtime": 0.459,
+      "eval_samples_per_second": 108.932,
+      "eval_steps_per_second": 15.251,
+      "step": 400
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 400,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 30,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.71737748979712e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

last-checkpoint/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b6e2f2eca20ee7f0dc810e31bbc4e91bca5e2910260904879ec847a70cdc12d
+size 6776

last-checkpoint/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "add_bos_token": true,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "</s>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "errors": "replace",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "</s>"
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b6e2f2eca20ee7f0dc810e31bbc4e91bca5e2910260904879ec847a70cdc12d
+size 6776

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff