Safetensors
qwen2
Mia Fournier commited on
Commit
64ddfa1
1 Parent(s): f723a5e

Upload folder using huggingface_hub

Browse files
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "MangyMango/testing1",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "eos_token_id": 151643,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "max_position_embeddings": 131072,
13
+ "max_window_layers": 28,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 28,
17
+ "num_key_value_heads": 2,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_theta": 1000000.0,
20
+ "sliding_window": null,
21
+ "tie_word_embeddings": true,
22
+ "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.44.0.dev0",
24
+ "use_cache": false,
25
+ "use_sliding_window": false,
26
+ "vocab_size": 151936
27
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": 151643,
5
+ "max_new_tokens": 2048,
6
+ "transformers_version": "4.44.0.dev0"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cc8093d8748bd1a89ef3c69a3aa8b8c0dd2ea4819c7ad0149deac76342efe61
3
+ size 3087467144
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f275a7f70afd86d3fb0dfb6c51c63f1488f40d5cd4b6b4a0526024ecf6208dc
3
+ size 6175147196
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d245e05e72192c132e0f2edb6fdcae0c578c890f0fe912f17ec7b0bba2d38cc3
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af3b6ec6e263e8506305011321cc19af20c5e04b33437bc261ef469e8b957f07
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token}}{% endif %}{% endfor %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|endoftext|>",
37
+ "errors": "replace",
38
+ "model_max_length": 32768,
39
+ "pad_token": "<|endoftext|>",
40
+ "split_special_tokens": false,
41
+ "tokenizer_class": "Qwen2Tokenizer",
42
+ "unk_token": null
43
+ }
trainer_state.json ADDED
@@ -0,0 +1,2715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9814460437640404,
5
+ "eval_steps": 47,
6
+ "global_step": 374,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005324902238122972,
13
+ "grad_norm": 1.5,
14
+ "learning_rate": 1.4285714285714286e-06,
15
+ "loss": 1.7903,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.005324902238122972,
20
+ "eval_loss": 1.775637149810791,
21
+ "eval_runtime": 77.1748,
22
+ "eval_samples_per_second": 15.29,
23
+ "eval_steps_per_second": 15.29,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.010649804476245944,
28
+ "grad_norm": 1.484375,
29
+ "learning_rate": 2.8571428571428573e-06,
30
+ "loss": 1.761,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 0.015974706714368916,
35
+ "grad_norm": 1.5546875,
36
+ "learning_rate": 4.2857142857142855e-06,
37
+ "loss": 1.8033,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.02129960895249189,
42
+ "grad_norm": 1.3671875,
43
+ "learning_rate": 5.7142857142857145e-06,
44
+ "loss": 1.7335,
45
+ "step": 4
46
+ },
47
+ {
48
+ "epoch": 0.02662451119061486,
49
+ "grad_norm": 1.484375,
50
+ "learning_rate": 7.1428571428571436e-06,
51
+ "loss": 1.7532,
52
+ "step": 5
53
+ },
54
+ {
55
+ "epoch": 0.03194941342873783,
56
+ "grad_norm": 1.3203125,
57
+ "learning_rate": 8.571428571428571e-06,
58
+ "loss": 1.7546,
59
+ "step": 6
60
+ },
61
+ {
62
+ "epoch": 0.037274315666860805,
63
+ "grad_norm": 1.21875,
64
+ "learning_rate": 1e-05,
65
+ "loss": 1.8068,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.04259921790498378,
70
+ "grad_norm": 1.1796875,
71
+ "learning_rate": 1.1428571428571429e-05,
72
+ "loss": 1.8295,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.04792412014310675,
77
+ "grad_norm": 1.078125,
78
+ "learning_rate": 1.2857142857142859e-05,
79
+ "loss": 1.7343,
80
+ "step": 9
81
+ },
82
+ {
83
+ "epoch": 0.05324902238122972,
84
+ "grad_norm": 1.25,
85
+ "learning_rate": 1.4285714285714287e-05,
86
+ "loss": 1.8318,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.058573924619352694,
91
+ "grad_norm": 1.1796875,
92
+ "learning_rate": 1.5714285714285715e-05,
93
+ "loss": 1.8064,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.06389882685747567,
98
+ "grad_norm": 1.1484375,
99
+ "learning_rate": 1.7142857142857142e-05,
100
+ "loss": 1.7317,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.06922372909559864,
105
+ "grad_norm": 1.2578125,
106
+ "learning_rate": 1.8571428571428575e-05,
107
+ "loss": 1.8281,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.07454863133372161,
112
+ "grad_norm": 1.1328125,
113
+ "learning_rate": 2e-05,
114
+ "loss": 1.6915,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.07987353357184458,
119
+ "grad_norm": 1.1484375,
120
+ "learning_rate": 1.9999619230641714e-05,
121
+ "loss": 1.7509,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.08519843580996755,
126
+ "grad_norm": 1.0859375,
127
+ "learning_rate": 1.9998476951563914e-05,
128
+ "loss": 1.7416,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.09052333804809053,
133
+ "grad_norm": 1.0859375,
134
+ "learning_rate": 1.9996573249755573e-05,
135
+ "loss": 1.7,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.0958482402862135,
140
+ "grad_norm": 1.1640625,
141
+ "learning_rate": 1.999390827019096e-05,
142
+ "loss": 1.771,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.10117314252433647,
147
+ "grad_norm": 1.1171875,
148
+ "learning_rate": 1.999048221581858e-05,
149
+ "loss": 1.7097,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.10649804476245944,
154
+ "grad_norm": 1.1015625,
155
+ "learning_rate": 1.9986295347545738e-05,
156
+ "loss": 1.7411,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.11182294700058241,
161
+ "grad_norm": 1.046875,
162
+ "learning_rate": 1.998134798421867e-05,
163
+ "loss": 1.7664,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 0.11714784923870539,
168
+ "grad_norm": 1.1171875,
169
+ "learning_rate": 1.9975640502598243e-05,
170
+ "loss": 1.7283,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 0.12247275147682836,
175
+ "grad_norm": 1.1015625,
176
+ "learning_rate": 1.9969173337331283e-05,
177
+ "loss": 1.8261,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.12779765371495133,
182
+ "grad_norm": 1.078125,
183
+ "learning_rate": 1.9961946980917457e-05,
184
+ "loss": 1.7096,
185
+ "step": 24
186
+ },
187
+ {
188
+ "epoch": 0.1331225559530743,
189
+ "grad_norm": 1.078125,
190
+ "learning_rate": 1.9953961983671792e-05,
191
+ "loss": 1.8025,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 0.13844745819119728,
196
+ "grad_norm": 1.0625,
197
+ "learning_rate": 1.9945218953682736e-05,
198
+ "loss": 1.7506,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 0.14377236042932023,
203
+ "grad_norm": 1.0546875,
204
+ "learning_rate": 1.9935718556765878e-05,
205
+ "loss": 1.7492,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 0.14909726266744322,
210
+ "grad_norm": 1.078125,
211
+ "learning_rate": 1.9925461516413224e-05,
212
+ "loss": 1.7108,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 0.15442216490556618,
217
+ "grad_norm": 1.0546875,
218
+ "learning_rate": 1.9914448613738107e-05,
219
+ "loss": 1.6542,
220
+ "step": 29
221
+ },
222
+ {
223
+ "epoch": 0.15974706714368916,
224
+ "grad_norm": 1.0234375,
225
+ "learning_rate": 1.9902680687415704e-05,
226
+ "loss": 1.7548,
227
+ "step": 30
228
+ },
229
+ {
230
+ "epoch": 0.16507196938181212,
231
+ "grad_norm": 1.1171875,
232
+ "learning_rate": 1.989015863361917e-05,
233
+ "loss": 1.7417,
234
+ "step": 31
235
+ },
236
+ {
237
+ "epoch": 0.1703968716199351,
238
+ "grad_norm": 1.0546875,
239
+ "learning_rate": 1.9876883405951378e-05,
240
+ "loss": 1.6839,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 0.17572177385805807,
245
+ "grad_norm": 1.0,
246
+ "learning_rate": 1.9862856015372315e-05,
247
+ "loss": 1.6547,
248
+ "step": 33
249
+ },
250
+ {
251
+ "epoch": 0.18104667609618105,
252
+ "grad_norm": 1.0625,
253
+ "learning_rate": 1.9848077530122083e-05,
254
+ "loss": 1.7696,
255
+ "step": 34
256
+ },
257
+ {
258
+ "epoch": 0.186371578334304,
259
+ "grad_norm": 1.1484375,
260
+ "learning_rate": 1.983254907563955e-05,
261
+ "loss": 1.6861,
262
+ "step": 35
263
+ },
264
+ {
265
+ "epoch": 0.191696480572427,
266
+ "grad_norm": 1.0703125,
267
+ "learning_rate": 1.9816271834476642e-05,
268
+ "loss": 1.7726,
269
+ "step": 36
270
+ },
271
+ {
272
+ "epoch": 0.19702138281054996,
273
+ "grad_norm": 1.0078125,
274
+ "learning_rate": 1.9799247046208297e-05,
275
+ "loss": 1.7046,
276
+ "step": 37
277
+ },
278
+ {
279
+ "epoch": 0.20234628504867294,
280
+ "grad_norm": 1.15625,
281
+ "learning_rate": 1.9781476007338058e-05,
282
+ "loss": 1.7071,
283
+ "step": 38
284
+ },
285
+ {
286
+ "epoch": 0.2076711872867959,
287
+ "grad_norm": 1.0859375,
288
+ "learning_rate": 1.9762960071199334e-05,
289
+ "loss": 1.8033,
290
+ "step": 39
291
+ },
292
+ {
293
+ "epoch": 0.21299608952491889,
294
+ "grad_norm": 1.0859375,
295
+ "learning_rate": 1.9743700647852356e-05,
296
+ "loss": 1.8173,
297
+ "step": 40
298
+ },
299
+ {
300
+ "epoch": 0.21832099176304184,
301
+ "grad_norm": 1.1171875,
302
+ "learning_rate": 1.9723699203976768e-05,
303
+ "loss": 1.7546,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 0.22364589400116483,
308
+ "grad_norm": 1.09375,
309
+ "learning_rate": 1.9702957262759964e-05,
310
+ "loss": 1.7599,
311
+ "step": 42
312
+ },
313
+ {
314
+ "epoch": 0.2289707962392878,
315
+ "grad_norm": 1.03125,
316
+ "learning_rate": 1.968147640378108e-05,
317
+ "loss": 1.6765,
318
+ "step": 43
319
+ },
320
+ {
321
+ "epoch": 0.23429569847741077,
322
+ "grad_norm": 1.046875,
323
+ "learning_rate": 1.9659258262890683e-05,
324
+ "loss": 1.7651,
325
+ "step": 44
326
+ },
327
+ {
328
+ "epoch": 0.23962060071553373,
329
+ "grad_norm": 1.0546875,
330
+ "learning_rate": 1.963630453208623e-05,
331
+ "loss": 1.6026,
332
+ "step": 45
333
+ },
334
+ {
335
+ "epoch": 0.24494550295365672,
336
+ "grad_norm": 1.1015625,
337
+ "learning_rate": 1.961261695938319e-05,
338
+ "loss": 1.7446,
339
+ "step": 46
340
+ },
341
+ {
342
+ "epoch": 0.2502704051917797,
343
+ "grad_norm": 1.015625,
344
+ "learning_rate": 1.958819734868193e-05,
345
+ "loss": 1.7374,
346
+ "step": 47
347
+ },
348
+ {
349
+ "epoch": 0.2502704051917797,
350
+ "eval_loss": 1.7246959209442139,
351
+ "eval_runtime": 84.4189,
352
+ "eval_samples_per_second": 13.978,
353
+ "eval_steps_per_second": 13.978,
354
+ "step": 47
355
+ },
356
+ {
357
+ "epoch": 0.25559530742990266,
358
+ "grad_norm": 1.0390625,
359
+ "learning_rate": 1.9563047559630356e-05,
360
+ "loss": 1.7224,
361
+ "step": 48
362
+ },
363
+ {
364
+ "epoch": 0.26092020966802565,
365
+ "grad_norm": 1.0703125,
366
+ "learning_rate": 1.953716950748227e-05,
367
+ "loss": 1.7919,
368
+ "step": 49
369
+ },
370
+ {
371
+ "epoch": 0.2662451119061486,
372
+ "grad_norm": 1.0625,
373
+ "learning_rate": 1.9510565162951538e-05,
374
+ "loss": 1.728,
375
+ "step": 50
376
+ },
377
+ {
378
+ "epoch": 0.27157001414427157,
379
+ "grad_norm": 1.109375,
380
+ "learning_rate": 1.9483236552061996e-05,
381
+ "loss": 1.758,
382
+ "step": 51
383
+ },
384
+ {
385
+ "epoch": 0.27689491638239455,
386
+ "grad_norm": 1.0078125,
387
+ "learning_rate": 1.945518575599317e-05,
388
+ "loss": 1.6651,
389
+ "step": 52
390
+ },
391
+ {
392
+ "epoch": 0.28221981862051754,
393
+ "grad_norm": 1.1015625,
394
+ "learning_rate": 1.9426414910921785e-05,
395
+ "loss": 1.7097,
396
+ "step": 53
397
+ },
398
+ {
399
+ "epoch": 0.28754472085864047,
400
+ "grad_norm": 1.0703125,
401
+ "learning_rate": 1.9396926207859085e-05,
402
+ "loss": 1.7315,
403
+ "step": 54
404
+ },
405
+ {
406
+ "epoch": 0.29286962309676345,
407
+ "grad_norm": 1.140625,
408
+ "learning_rate": 1.9366721892483976e-05,
409
+ "loss": 1.7535,
410
+ "step": 55
411
+ },
412
+ {
413
+ "epoch": 0.29819452533488644,
414
+ "grad_norm": 1.0859375,
415
+ "learning_rate": 1.9335804264972018e-05,
416
+ "loss": 1.7806,
417
+ "step": 56
418
+ },
419
+ {
420
+ "epoch": 0.3035194275730094,
421
+ "grad_norm": 1.046875,
422
+ "learning_rate": 1.9304175679820247e-05,
423
+ "loss": 1.7565,
424
+ "step": 57
425
+ },
426
+ {
427
+ "epoch": 0.30884432981113236,
428
+ "grad_norm": 1.0234375,
429
+ "learning_rate": 1.9271838545667876e-05,
430
+ "loss": 1.7056,
431
+ "step": 58
432
+ },
433
+ {
434
+ "epoch": 0.31416923204925534,
435
+ "grad_norm": 0.9921875,
436
+ "learning_rate": 1.9238795325112867e-05,
437
+ "loss": 1.7266,
438
+ "step": 59
439
+ },
440
+ {
441
+ "epoch": 0.31949413428737833,
442
+ "grad_norm": 1.046875,
443
+ "learning_rate": 1.9205048534524405e-05,
444
+ "loss": 1.6798,
445
+ "step": 60
446
+ },
447
+ {
448
+ "epoch": 0.3248190365255013,
449
+ "grad_norm": 1.1328125,
450
+ "learning_rate": 1.917060074385124e-05,
451
+ "loss": 1.6847,
452
+ "step": 61
453
+ },
454
+ {
455
+ "epoch": 0.33014393876362425,
456
+ "grad_norm": 1.0390625,
457
+ "learning_rate": 1.913545457642601e-05,
458
+ "loss": 1.6441,
459
+ "step": 62
460
+ },
461
+ {
462
+ "epoch": 0.33546884100174723,
463
+ "grad_norm": 1.0234375,
464
+ "learning_rate": 1.9099612708765432e-05,
465
+ "loss": 1.6839,
466
+ "step": 63
467
+ },
468
+ {
469
+ "epoch": 0.3407937432398702,
470
+ "grad_norm": 1.0625,
471
+ "learning_rate": 1.9063077870366504e-05,
472
+ "loss": 1.6653,
473
+ "step": 64
474
+ },
475
+ {
476
+ "epoch": 0.3461186454779932,
477
+ "grad_norm": 1.0,
478
+ "learning_rate": 1.902585284349861e-05,
479
+ "loss": 1.6688,
480
+ "step": 65
481
+ },
482
+ {
483
+ "epoch": 0.35144354771611613,
484
+ "grad_norm": 1.046875,
485
+ "learning_rate": 1.8987940462991673e-05,
486
+ "loss": 1.6317,
487
+ "step": 66
488
+ },
489
+ {
490
+ "epoch": 0.3567684499542391,
491
+ "grad_norm": 0.94921875,
492
+ "learning_rate": 1.894934361602025e-05,
493
+ "loss": 1.6889,
494
+ "step": 67
495
+ },
496
+ {
497
+ "epoch": 0.3620933521923621,
498
+ "grad_norm": 1.1015625,
499
+ "learning_rate": 1.891006524188368e-05,
500
+ "loss": 1.7744,
501
+ "step": 68
502
+ },
503
+ {
504
+ "epoch": 0.3674182544304851,
505
+ "grad_norm": 1.109375,
506
+ "learning_rate": 1.887010833178222e-05,
507
+ "loss": 1.7525,
508
+ "step": 69
509
+ },
510
+ {
511
+ "epoch": 0.372743156668608,
512
+ "grad_norm": 1.03125,
513
+ "learning_rate": 1.8829475928589272e-05,
514
+ "loss": 1.5786,
515
+ "step": 70
516
+ },
517
+ {
518
+ "epoch": 0.378068058906731,
519
+ "grad_norm": 0.984375,
520
+ "learning_rate": 1.8788171126619653e-05,
521
+ "loss": 1.7715,
522
+ "step": 71
523
+ },
524
+ {
525
+ "epoch": 0.383392961144854,
526
+ "grad_norm": 1.0078125,
527
+ "learning_rate": 1.874619707139396e-05,
528
+ "loss": 1.6526,
529
+ "step": 72
530
+ },
531
+ {
532
+ "epoch": 0.388717863382977,
533
+ "grad_norm": 1.0390625,
534
+ "learning_rate": 1.8703556959398998e-05,
535
+ "loss": 1.7511,
536
+ "step": 73
537
+ },
538
+ {
539
+ "epoch": 0.3940427656210999,
540
+ "grad_norm": 1.046875,
541
+ "learning_rate": 1.866025403784439e-05,
542
+ "loss": 1.789,
543
+ "step": 74
544
+ },
545
+ {
546
+ "epoch": 0.3993676678592229,
547
+ "grad_norm": 1.078125,
548
+ "learning_rate": 1.861629160441526e-05,
549
+ "loss": 1.7705,
550
+ "step": 75
551
+ },
552
+ {
553
+ "epoch": 0.4046925700973459,
554
+ "grad_norm": 1.0625,
555
+ "learning_rate": 1.8571673007021124e-05,
556
+ "loss": 1.7042,
557
+ "step": 76
558
+ },
559
+ {
560
+ "epoch": 0.4100174723354688,
561
+ "grad_norm": 0.98828125,
562
+ "learning_rate": 1.8526401643540924e-05,
563
+ "loss": 1.6971,
564
+ "step": 77
565
+ },
566
+ {
567
+ "epoch": 0.4153423745735918,
568
+ "grad_norm": 0.9765625,
569
+ "learning_rate": 1.848048096156426e-05,
570
+ "loss": 1.6873,
571
+ "step": 78
572
+ },
573
+ {
574
+ "epoch": 0.4206672768117148,
575
+ "grad_norm": 0.953125,
576
+ "learning_rate": 1.843391445812886e-05,
577
+ "loss": 1.6937,
578
+ "step": 79
579
+ },
580
+ {
581
+ "epoch": 0.42599217904983777,
582
+ "grad_norm": 1.0234375,
583
+ "learning_rate": 1.8386705679454243e-05,
584
+ "loss": 1.6798,
585
+ "step": 80
586
+ },
587
+ {
588
+ "epoch": 0.4313170812879607,
589
+ "grad_norm": 1.0703125,
590
+ "learning_rate": 1.8338858220671683e-05,
591
+ "loss": 1.7316,
592
+ "step": 81
593
+ },
594
+ {
595
+ "epoch": 0.4366419835260837,
596
+ "grad_norm": 1.0390625,
597
+ "learning_rate": 1.8290375725550417e-05,
598
+ "loss": 1.6617,
599
+ "step": 82
600
+ },
601
+ {
602
+ "epoch": 0.4419668857642067,
603
+ "grad_norm": 1.078125,
604
+ "learning_rate": 1.8241261886220155e-05,
605
+ "loss": 1.7817,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 0.44729178800232966,
610
+ "grad_norm": 1.046875,
611
+ "learning_rate": 1.819152044288992e-05,
612
+ "loss": 1.7649,
613
+ "step": 84
614
+ },
615
+ {
616
+ "epoch": 0.4526166902404526,
617
+ "grad_norm": 1.0546875,
618
+ "learning_rate": 1.8141155183563195e-05,
619
+ "loss": 1.7158,
620
+ "step": 85
621
+ },
622
+ {
623
+ "epoch": 0.4579415924785756,
624
+ "grad_norm": 1.1171875,
625
+ "learning_rate": 1.8090169943749477e-05,
626
+ "loss": 1.7345,
627
+ "step": 86
628
+ },
629
+ {
630
+ "epoch": 0.46326649471669856,
631
+ "grad_norm": 1.0234375,
632
+ "learning_rate": 1.8038568606172172e-05,
633
+ "loss": 1.6563,
634
+ "step": 87
635
+ },
636
+ {
637
+ "epoch": 0.46859139695482155,
638
+ "grad_norm": 1.0234375,
639
+ "learning_rate": 1.798635510047293e-05,
640
+ "loss": 1.726,
641
+ "step": 88
642
+ },
643
+ {
644
+ "epoch": 0.4739162991929445,
645
+ "grad_norm": 0.9921875,
646
+ "learning_rate": 1.7933533402912354e-05,
647
+ "loss": 1.6767,
648
+ "step": 89
649
+ },
650
+ {
651
+ "epoch": 0.47924120143106747,
652
+ "grad_norm": 1.0234375,
653
+ "learning_rate": 1.788010753606722e-05,
654
+ "loss": 1.6585,
655
+ "step": 90
656
+ },
657
+ {
658
+ "epoch": 0.48456610366919045,
659
+ "grad_norm": 0.9609375,
660
+ "learning_rate": 1.782608156852414e-05,
661
+ "loss": 1.6202,
662
+ "step": 91
663
+ },
664
+ {
665
+ "epoch": 0.48989100590731344,
666
+ "grad_norm": 1.0234375,
667
+ "learning_rate": 1.777145961456971e-05,
668
+ "loss": 1.7857,
669
+ "step": 92
670
+ },
671
+ {
672
+ "epoch": 0.49521590814543637,
673
+ "grad_norm": 1.0546875,
674
+ "learning_rate": 1.7716245833877202e-05,
675
+ "loss": 1.7479,
676
+ "step": 93
677
+ },
678
+ {
679
+ "epoch": 0.5005408103835594,
680
+ "grad_norm": 1.0625,
681
+ "learning_rate": 1.766044443118978e-05,
682
+ "loss": 1.7516,
683
+ "step": 94
684
+ },
685
+ {
686
+ "epoch": 0.5005408103835594,
687
+ "eval_loss": 1.7072687149047852,
688
+ "eval_runtime": 77.7379,
689
+ "eval_samples_per_second": 15.179,
690
+ "eval_steps_per_second": 15.179,
691
+ "step": 94
692
+ },
693
+ {
694
+ "epoch": 0.5058657126216823,
695
+ "grad_norm": 1.0703125,
696
+ "learning_rate": 1.7604059656000313e-05,
697
+ "loss": 1.6983,
698
+ "step": 95
699
+ },
700
+ {
701
+ "epoch": 0.5111906148598053,
702
+ "grad_norm": 1.078125,
703
+ "learning_rate": 1.7547095802227723e-05,
704
+ "loss": 1.7144,
705
+ "step": 96
706
+ },
707
+ {
708
+ "epoch": 0.5165155170979283,
709
+ "grad_norm": 1.0546875,
710
+ "learning_rate": 1.7489557207890025e-05,
711
+ "loss": 1.754,
712
+ "step": 97
713
+ },
714
+ {
715
+ "epoch": 0.5218404193360513,
716
+ "grad_norm": 1.03125,
717
+ "learning_rate": 1.7431448254773943e-05,
718
+ "loss": 1.7862,
719
+ "step": 98
720
+ },
721
+ {
722
+ "epoch": 0.5271653215741742,
723
+ "grad_norm": 1.046875,
724
+ "learning_rate": 1.737277336810124e-05,
725
+ "loss": 1.7244,
726
+ "step": 99
727
+ },
728
+ {
729
+ "epoch": 0.5324902238122972,
730
+ "grad_norm": 1.0546875,
731
+ "learning_rate": 1.7313537016191706e-05,
732
+ "loss": 1.7129,
733
+ "step": 100
734
+ },
735
+ {
736
+ "epoch": 0.5378151260504201,
737
+ "grad_norm": 1.0390625,
738
+ "learning_rate": 1.7253743710122877e-05,
739
+ "loss": 1.6302,
740
+ "step": 101
741
+ },
742
+ {
743
+ "epoch": 0.5431400282885431,
744
+ "grad_norm": 0.99609375,
745
+ "learning_rate": 1.7193398003386514e-05,
746
+ "loss": 1.6831,
747
+ "step": 102
748
+ },
749
+ {
750
+ "epoch": 0.5484649305266661,
751
+ "grad_norm": 1.0,
752
+ "learning_rate": 1.713250449154182e-05,
753
+ "loss": 1.74,
754
+ "step": 103
755
+ },
756
+ {
757
+ "epoch": 0.5537898327647891,
758
+ "grad_norm": 1.0,
759
+ "learning_rate": 1.7071067811865477e-05,
760
+ "loss": 1.7333,
761
+ "step": 104
762
+ },
763
+ {
764
+ "epoch": 0.5591147350029121,
765
+ "grad_norm": 1.0546875,
766
+ "learning_rate": 1.700909264299851e-05,
767
+ "loss": 1.7359,
768
+ "step": 105
769
+ },
770
+ {
771
+ "epoch": 0.5644396372410351,
772
+ "grad_norm": 0.98828125,
773
+ "learning_rate": 1.6946583704589973e-05,
774
+ "loss": 1.7432,
775
+ "step": 106
776
+ },
777
+ {
778
+ "epoch": 0.569764539479158,
779
+ "grad_norm": 1.0078125,
780
+ "learning_rate": 1.688354575693754e-05,
781
+ "loss": 1.6974,
782
+ "step": 107
783
+ },
784
+ {
785
+ "epoch": 0.5750894417172809,
786
+ "grad_norm": 1.03125,
787
+ "learning_rate": 1.6819983600624986e-05,
788
+ "loss": 1.7011,
789
+ "step": 108
790
+ },
791
+ {
792
+ "epoch": 0.5804143439554039,
793
+ "grad_norm": 1.0703125,
794
+ "learning_rate": 1.6755902076156606e-05,
795
+ "loss": 1.665,
796
+ "step": 109
797
+ },
798
+ {
799
+ "epoch": 0.5857392461935269,
800
+ "grad_norm": 1.125,
801
+ "learning_rate": 1.6691306063588583e-05,
802
+ "loss": 1.8047,
803
+ "step": 110
804
+ },
805
+ {
806
+ "epoch": 0.5910641484316499,
807
+ "grad_norm": 0.9921875,
808
+ "learning_rate": 1.6626200482157378e-05,
809
+ "loss": 1.6664,
810
+ "step": 111
811
+ },
812
+ {
813
+ "epoch": 0.5963890506697729,
814
+ "grad_norm": 1.0234375,
815
+ "learning_rate": 1.6560590289905074e-05,
816
+ "loss": 1.7291,
817
+ "step": 112
818
+ },
819
+ {
820
+ "epoch": 0.6017139529078959,
821
+ "grad_norm": 1.0859375,
822
+ "learning_rate": 1.6494480483301836e-05,
823
+ "loss": 1.709,
824
+ "step": 113
825
+ },
826
+ {
827
+ "epoch": 0.6070388551460189,
828
+ "grad_norm": 1.0234375,
829
+ "learning_rate": 1.6427876096865394e-05,
830
+ "loss": 1.7286,
831
+ "step": 114
832
+ },
833
+ {
834
+ "epoch": 0.6123637573841417,
835
+ "grad_norm": 1.0078125,
836
+ "learning_rate": 1.636078220277764e-05,
837
+ "loss": 1.6269,
838
+ "step": 115
839
+ },
840
+ {
841
+ "epoch": 0.6176886596222647,
842
+ "grad_norm": 1.0234375,
843
+ "learning_rate": 1.6293203910498375e-05,
844
+ "loss": 1.6286,
845
+ "step": 116
846
+ },
847
+ {
848
+ "epoch": 0.6230135618603877,
849
+ "grad_norm": 1.09375,
850
+ "learning_rate": 1.6225146366376198e-05,
851
+ "loss": 1.78,
852
+ "step": 117
853
+ },
854
+ {
855
+ "epoch": 0.6283384640985107,
856
+ "grad_norm": 1.078125,
857
+ "learning_rate": 1.6156614753256583e-05,
858
+ "loss": 1.7336,
859
+ "step": 118
860
+ },
861
+ {
862
+ "epoch": 0.6336633663366337,
863
+ "grad_norm": 1.0625,
864
+ "learning_rate": 1.608761429008721e-05,
865
+ "loss": 1.6578,
866
+ "step": 119
867
+ },
868
+ {
869
+ "epoch": 0.6389882685747567,
870
+ "grad_norm": 1.015625,
871
+ "learning_rate": 1.6018150231520486e-05,
872
+ "loss": 1.6736,
873
+ "step": 120
874
+ },
875
+ {
876
+ "epoch": 0.6443131708128796,
877
+ "grad_norm": 1.0546875,
878
+ "learning_rate": 1.5948227867513416e-05,
879
+ "loss": 1.5976,
880
+ "step": 121
881
+ },
882
+ {
883
+ "epoch": 0.6496380730510026,
884
+ "grad_norm": 1.0078125,
885
+ "learning_rate": 1.5877852522924733e-05,
886
+ "loss": 1.7155,
887
+ "step": 122
888
+ },
889
+ {
890
+ "epoch": 0.6549629752891255,
891
+ "grad_norm": 1.0078125,
892
+ "learning_rate": 1.5807029557109398e-05,
893
+ "loss": 1.6595,
894
+ "step": 123
895
+ },
896
+ {
897
+ "epoch": 0.6602878775272485,
898
+ "grad_norm": 1.0234375,
899
+ "learning_rate": 1.573576436351046e-05,
900
+ "loss": 1.7354,
901
+ "step": 124
902
+ },
903
+ {
904
+ "epoch": 0.6656127797653715,
905
+ "grad_norm": 1.0703125,
906
+ "learning_rate": 1.566406236924833e-05,
907
+ "loss": 1.7401,
908
+ "step": 125
909
+ },
910
+ {
911
+ "epoch": 0.6709376820034945,
912
+ "grad_norm": 1.0625,
913
+ "learning_rate": 1.5591929034707468e-05,
914
+ "loss": 1.6774,
915
+ "step": 126
916
+ },
917
+ {
918
+ "epoch": 0.6762625842416174,
919
+ "grad_norm": 1.0078125,
920
+ "learning_rate": 1.5519369853120584e-05,
921
+ "loss": 1.6818,
922
+ "step": 127
923
+ },
924
+ {
925
+ "epoch": 0.6815874864797404,
926
+ "grad_norm": 1.0390625,
927
+ "learning_rate": 1.5446390350150272e-05,
928
+ "loss": 1.598,
929
+ "step": 128
930
+ },
931
+ {
932
+ "epoch": 0.6869123887178634,
933
+ "grad_norm": 1.015625,
934
+ "learning_rate": 1.5372996083468242e-05,
935
+ "loss": 1.7103,
936
+ "step": 129
937
+ },
938
+ {
939
+ "epoch": 0.6922372909559864,
940
+ "grad_norm": 1.015625,
941
+ "learning_rate": 1.529919264233205e-05,
942
+ "loss": 1.6196,
943
+ "step": 130
944
+ },
945
+ {
946
+ "epoch": 0.6975621931941093,
947
+ "grad_norm": 1.046875,
948
+ "learning_rate": 1.5224985647159489e-05,
949
+ "loss": 1.618,
950
+ "step": 131
951
+ },
952
+ {
953
+ "epoch": 0.7028870954322323,
954
+ "grad_norm": 1.03125,
955
+ "learning_rate": 1.5150380749100545e-05,
956
+ "loss": 1.7159,
957
+ "step": 132
958
+ },
959
+ {
960
+ "epoch": 0.7082119976703553,
961
+ "grad_norm": 0.9921875,
962
+ "learning_rate": 1.5075383629607043e-05,
963
+ "loss": 1.6372,
964
+ "step": 133
965
+ },
966
+ {
967
+ "epoch": 0.7135368999084782,
968
+ "grad_norm": 1.0625,
969
+ "learning_rate": 1.5000000000000002e-05,
970
+ "loss": 1.7368,
971
+ "step": 134
972
+ },
973
+ {
974
+ "epoch": 0.7188618021466012,
975
+ "grad_norm": 0.97265625,
976
+ "learning_rate": 1.4924235601034673e-05,
977
+ "loss": 1.6675,
978
+ "step": 135
979
+ },
980
+ {
981
+ "epoch": 0.7241867043847242,
982
+ "grad_norm": 1.078125,
983
+ "learning_rate": 1.4848096202463373e-05,
984
+ "loss": 1.6651,
985
+ "step": 136
986
+ },
987
+ {
988
+ "epoch": 0.7295116066228472,
989
+ "grad_norm": 1.0078125,
990
+ "learning_rate": 1.4771587602596085e-05,
991
+ "loss": 1.6842,
992
+ "step": 137
993
+ },
994
+ {
995
+ "epoch": 0.7348365088609702,
996
+ "grad_norm": 1.015625,
997
+ "learning_rate": 1.469471562785891e-05,
998
+ "loss": 1.682,
999
+ "step": 138
1000
+ },
1001
+ {
1002
+ "epoch": 0.7401614110990931,
1003
+ "grad_norm": 1.03125,
1004
+ "learning_rate": 1.4617486132350343e-05,
1005
+ "loss": 1.697,
1006
+ "step": 139
1007
+ },
1008
+ {
1009
+ "epoch": 0.745486313337216,
1010
+ "grad_norm": 1.03125,
1011
+ "learning_rate": 1.4539904997395468e-05,
1012
+ "loss": 1.6511,
1013
+ "step": 140
1014
+ },
1015
+ {
1016
+ "epoch": 0.750811215575339,
1017
+ "grad_norm": 1.0625,
1018
+ "learning_rate": 1.4461978131098089e-05,
1019
+ "loss": 1.6586,
1020
+ "step": 141
1021
+ },
1022
+ {
1023
+ "epoch": 0.750811215575339,
1024
+ "eval_loss": 1.6974835395812988,
1025
+ "eval_runtime": 77.2891,
1026
+ "eval_samples_per_second": 15.267,
1027
+ "eval_steps_per_second": 15.267,
1028
+ "step": 141
1029
+ },
1030
+ {
1031
+ "epoch": 0.756136117813462,
1032
+ "grad_norm": 1.0625,
1033
+ "learning_rate": 1.4383711467890776e-05,
1034
+ "loss": 1.7338,
1035
+ "step": 142
1036
+ },
1037
+ {
1038
+ "epoch": 0.761461020051585,
1039
+ "grad_norm": 1.0234375,
1040
+ "learning_rate": 1.4305110968082953e-05,
1041
+ "loss": 1.6623,
1042
+ "step": 143
1043
+ },
1044
+ {
1045
+ "epoch": 0.766785922289708,
1046
+ "grad_norm": 1.03125,
1047
+ "learning_rate": 1.4226182617406996e-05,
1048
+ "loss": 1.6748,
1049
+ "step": 144
1050
+ },
1051
+ {
1052
+ "epoch": 0.772110824527831,
1053
+ "grad_norm": 1.0,
1054
+ "learning_rate": 1.4146932426562391e-05,
1055
+ "loss": 1.7057,
1056
+ "step": 145
1057
+ },
1058
+ {
1059
+ "epoch": 0.777435726765954,
1060
+ "grad_norm": 1.0234375,
1061
+ "learning_rate": 1.4067366430758004e-05,
1062
+ "loss": 1.6403,
1063
+ "step": 146
1064
+ },
1065
+ {
1066
+ "epoch": 0.7827606290040768,
1067
+ "grad_norm": 1.09375,
1068
+ "learning_rate": 1.3987490689252463e-05,
1069
+ "loss": 1.7262,
1070
+ "step": 147
1071
+ },
1072
+ {
1073
+ "epoch": 0.7880855312421998,
1074
+ "grad_norm": 1.0703125,
1075
+ "learning_rate": 1.3907311284892737e-05,
1076
+ "loss": 1.7561,
1077
+ "step": 148
1078
+ },
1079
+ {
1080
+ "epoch": 0.7934104334803228,
1081
+ "grad_norm": 1.109375,
1082
+ "learning_rate": 1.3826834323650899e-05,
1083
+ "loss": 1.6847,
1084
+ "step": 149
1085
+ },
1086
+ {
1087
+ "epoch": 0.7987353357184458,
1088
+ "grad_norm": 1.015625,
1089
+ "learning_rate": 1.3746065934159123e-05,
1090
+ "loss": 1.5979,
1091
+ "step": 150
1092
+ },
1093
+ {
1094
+ "epoch": 0.8040602379565688,
1095
+ "grad_norm": 1.0859375,
1096
+ "learning_rate": 1.3665012267242974e-05,
1097
+ "loss": 1.6537,
1098
+ "step": 151
1099
+ },
1100
+ {
1101
+ "epoch": 0.8093851401946918,
1102
+ "grad_norm": 1.03125,
1103
+ "learning_rate": 1.3583679495453e-05,
1104
+ "loss": 1.7491,
1105
+ "step": 152
1106
+ },
1107
+ {
1108
+ "epoch": 0.8147100424328148,
1109
+ "grad_norm": 1.0,
1110
+ "learning_rate": 1.3502073812594677e-05,
1111
+ "loss": 1.6909,
1112
+ "step": 153
1113
+ },
1114
+ {
1115
+ "epoch": 0.8200349446709376,
1116
+ "grad_norm": 1.0390625,
1117
+ "learning_rate": 1.342020143325669e-05,
1118
+ "loss": 1.6917,
1119
+ "step": 154
1120
+ },
1121
+ {
1122
+ "epoch": 0.8253598469090606,
1123
+ "grad_norm": 0.99609375,
1124
+ "learning_rate": 1.333806859233771e-05,
1125
+ "loss": 1.692,
1126
+ "step": 155
1127
+ },
1128
+ {
1129
+ "epoch": 0.8306847491471836,
1130
+ "grad_norm": 1.0390625,
1131
+ "learning_rate": 1.3255681544571568e-05,
1132
+ "loss": 1.6995,
1133
+ "step": 156
1134
+ },
1135
+ {
1136
+ "epoch": 0.8360096513853066,
1137
+ "grad_norm": 1.03125,
1138
+ "learning_rate": 1.3173046564050923e-05,
1139
+ "loss": 1.6612,
1140
+ "step": 157
1141
+ },
1142
+ {
1143
+ "epoch": 0.8413345536234296,
1144
+ "grad_norm": 1.09375,
1145
+ "learning_rate": 1.3090169943749475e-05,
1146
+ "loss": 1.7376,
1147
+ "step": 158
1148
+ },
1149
+ {
1150
+ "epoch": 0.8466594558615526,
1151
+ "grad_norm": 1.0,
1152
+ "learning_rate": 1.300705799504273e-05,
1153
+ "loss": 1.6703,
1154
+ "step": 159
1155
+ },
1156
+ {
1157
+ "epoch": 0.8519843580996755,
1158
+ "grad_norm": 1.0546875,
1159
+ "learning_rate": 1.2923717047227368e-05,
1160
+ "loss": 1.6611,
1161
+ "step": 160
1162
+ },
1163
+ {
1164
+ "epoch": 0.8573092603377985,
1165
+ "grad_norm": 0.95703125,
1166
+ "learning_rate": 1.284015344703923e-05,
1167
+ "loss": 1.6215,
1168
+ "step": 161
1169
+ },
1170
+ {
1171
+ "epoch": 0.8626341625759214,
1172
+ "grad_norm": 1.046875,
1173
+ "learning_rate": 1.2756373558169992e-05,
1174
+ "loss": 1.7199,
1175
+ "step": 162
1176
+ },
1177
+ {
1178
+ "epoch": 0.8679590648140444,
1179
+ "grad_norm": 1.0078125,
1180
+ "learning_rate": 1.267238376078257e-05,
1181
+ "loss": 1.6418,
1182
+ "step": 163
1183
+ },
1184
+ {
1185
+ "epoch": 0.8732839670521674,
1186
+ "grad_norm": 0.9140625,
1187
+ "learning_rate": 1.2588190451025209e-05,
1188
+ "loss": 1.6476,
1189
+ "step": 164
1190
+ },
1191
+ {
1192
+ "epoch": 0.8786088692902904,
1193
+ "grad_norm": 0.96484375,
1194
+ "learning_rate": 1.2503800040544417e-05,
1195
+ "loss": 1.6088,
1196
+ "step": 165
1197
+ },
1198
+ {
1199
+ "epoch": 0.8839337715284133,
1200
+ "grad_norm": 0.9921875,
1201
+ "learning_rate": 1.2419218955996677e-05,
1202
+ "loss": 1.6919,
1203
+ "step": 166
1204
+ },
1205
+ {
1206
+ "epoch": 0.8892586737665363,
1207
+ "grad_norm": 1.0625,
1208
+ "learning_rate": 1.2334453638559057e-05,
1209
+ "loss": 1.6118,
1210
+ "step": 167
1211
+ },
1212
+ {
1213
+ "epoch": 0.8945835760046593,
1214
+ "grad_norm": 1.0546875,
1215
+ "learning_rate": 1.2249510543438652e-05,
1216
+ "loss": 1.7246,
1217
+ "step": 168
1218
+ },
1219
+ {
1220
+ "epoch": 0.8999084782427823,
1221
+ "grad_norm": 1.0390625,
1222
+ "learning_rate": 1.2164396139381029e-05,
1223
+ "loss": 1.6821,
1224
+ "step": 169
1225
+ },
1226
+ {
1227
+ "epoch": 0.9052333804809052,
1228
+ "grad_norm": 1.0625,
1229
+ "learning_rate": 1.2079116908177592e-05,
1230
+ "loss": 1.6474,
1231
+ "step": 170
1232
+ },
1233
+ {
1234
+ "epoch": 0.9105582827190282,
1235
+ "grad_norm": 1.1015625,
1236
+ "learning_rate": 1.1993679344171973e-05,
1237
+ "loss": 1.7538,
1238
+ "step": 171
1239
+ },
1240
+ {
1241
+ "epoch": 0.9158831849571512,
1242
+ "grad_norm": 1.0078125,
1243
+ "learning_rate": 1.190808995376545e-05,
1244
+ "loss": 1.6299,
1245
+ "step": 172
1246
+ },
1247
+ {
1248
+ "epoch": 0.9212080871952741,
1249
+ "grad_norm": 1.0078125,
1250
+ "learning_rate": 1.1822355254921478e-05,
1251
+ "loss": 1.6671,
1252
+ "step": 173
1253
+ },
1254
+ {
1255
+ "epoch": 0.9265329894333971,
1256
+ "grad_norm": 1.0078125,
1257
+ "learning_rate": 1.1736481776669307e-05,
1258
+ "loss": 1.6996,
1259
+ "step": 174
1260
+ },
1261
+ {
1262
+ "epoch": 0.9318578916715201,
1263
+ "grad_norm": 1.0546875,
1264
+ "learning_rate": 1.1650476058606776e-05,
1265
+ "loss": 1.6863,
1266
+ "step": 175
1267
+ },
1268
+ {
1269
+ "epoch": 0.9371827939096431,
1270
+ "grad_norm": 1.1171875,
1271
+ "learning_rate": 1.156434465040231e-05,
1272
+ "loss": 1.7013,
1273
+ "step": 176
1274
+ },
1275
+ {
1276
+ "epoch": 0.9425076961477661,
1277
+ "grad_norm": 0.9921875,
1278
+ "learning_rate": 1.1478094111296109e-05,
1279
+ "loss": 1.7416,
1280
+ "step": 177
1281
+ },
1282
+ {
1283
+ "epoch": 0.947832598385889,
1284
+ "grad_norm": 1.03125,
1285
+ "learning_rate": 1.1391731009600655e-05,
1286
+ "loss": 1.7964,
1287
+ "step": 178
1288
+ },
1289
+ {
1290
+ "epoch": 0.9531575006240119,
1291
+ "grad_norm": 1.046875,
1292
+ "learning_rate": 1.130526192220052e-05,
1293
+ "loss": 1.761,
1294
+ "step": 179
1295
+ },
1296
+ {
1297
+ "epoch": 0.9584824028621349,
1298
+ "grad_norm": 1.0234375,
1299
+ "learning_rate": 1.1218693434051475e-05,
1300
+ "loss": 1.6999,
1301
+ "step": 180
1302
+ },
1303
+ {
1304
+ "epoch": 0.9638073051002579,
1305
+ "grad_norm": 1.0546875,
1306
+ "learning_rate": 1.113203213767907e-05,
1307
+ "loss": 1.69,
1308
+ "step": 181
1309
+ },
1310
+ {
1311
+ "epoch": 0.9691322073383809,
1312
+ "grad_norm": 0.953125,
1313
+ "learning_rate": 1.1045284632676535e-05,
1314
+ "loss": 1.6777,
1315
+ "step": 182
1316
+ },
1317
+ {
1318
+ "epoch": 0.9744571095765039,
1319
+ "grad_norm": 1.0234375,
1320
+ "learning_rate": 1.0958457525202241e-05,
1321
+ "loss": 1.6792,
1322
+ "step": 183
1323
+ },
1324
+ {
1325
+ "epoch": 0.9797820118146269,
1326
+ "grad_norm": 1.03125,
1327
+ "learning_rate": 1.0871557427476585e-05,
1328
+ "loss": 1.6724,
1329
+ "step": 184
1330
+ },
1331
+ {
1332
+ "epoch": 0.9851069140527499,
1333
+ "grad_norm": 1.0625,
1334
+ "learning_rate": 1.0784590957278452e-05,
1335
+ "loss": 1.7415,
1336
+ "step": 185
1337
+ },
1338
+ {
1339
+ "epoch": 0.9904318162908727,
1340
+ "grad_norm": 1.0,
1341
+ "learning_rate": 1.0697564737441254e-05,
1342
+ "loss": 1.6796,
1343
+ "step": 186
1344
+ },
1345
+ {
1346
+ "epoch": 0.9957567185289957,
1347
+ "grad_norm": 1.015625,
1348
+ "learning_rate": 1.0610485395348571e-05,
1349
+ "loss": 1.6186,
1350
+ "step": 187
1351
+ },
1352
+ {
1353
+ "epoch": 1.0010816207671187,
1354
+ "grad_norm": 0.984375,
1355
+ "learning_rate": 1.0523359562429441e-05,
1356
+ "loss": 1.6741,
1357
+ "step": 188
1358
+ },
1359
+ {
1360
+ "epoch": 1.0010816207671187,
1361
+ "eval_loss": 1.6922072172164917,
1362
+ "eval_runtime": 77.3424,
1363
+ "eval_samples_per_second": 15.257,
1364
+ "eval_steps_per_second": 15.257,
1365
+ "step": 188
1366
+ },
1367
+ {
1368
+ "epoch": 1.0064065230052417,
1369
+ "grad_norm": 0.9921875,
1370
+ "learning_rate": 1.0436193873653362e-05,
1371
+ "loss": 1.7202,
1372
+ "step": 189
1373
+ },
1374
+ {
1375
+ "epoch": 1.0016640319494134,
1376
+ "grad_norm": 0.9921875,
1377
+ "learning_rate": 1.0348994967025012e-05,
1378
+ "loss": 1.639,
1379
+ "step": 190
1380
+ },
1381
+ {
1382
+ "epoch": 1.0069889341875364,
1383
+ "grad_norm": 1.0546875,
1384
+ "learning_rate": 1.0261769483078734e-05,
1385
+ "loss": 1.6223,
1386
+ "step": 191
1387
+ },
1388
+ {
1389
+ "epoch": 1.0123138364256594,
1390
+ "grad_norm": 1.0078125,
1391
+ "learning_rate": 1.0174524064372837e-05,
1392
+ "loss": 1.7193,
1393
+ "step": 192
1394
+ },
1395
+ {
1396
+ "epoch": 1.0176387386637824,
1397
+ "grad_norm": 1.0078125,
1398
+ "learning_rate": 1.008726535498374e-05,
1399
+ "loss": 1.6904,
1400
+ "step": 193
1401
+ },
1402
+ {
1403
+ "epoch": 1.0229636409019054,
1404
+ "grad_norm": 1.03125,
1405
+ "learning_rate": 1e-05,
1406
+ "loss": 1.6921,
1407
+ "step": 194
1408
+ },
1409
+ {
1410
+ "epoch": 1.0282885431400284,
1411
+ "grad_norm": 0.9921875,
1412
+ "learning_rate": 9.912734645016262e-06,
1413
+ "loss": 1.6593,
1414
+ "step": 195
1415
+ },
1416
+ {
1417
+ "epoch": 1.0336134453781514,
1418
+ "grad_norm": 0.97265625,
1419
+ "learning_rate": 9.825475935627165e-06,
1420
+ "loss": 1.6469,
1421
+ "step": 196
1422
+ },
1423
+ {
1424
+ "epoch": 1.0389383476162741,
1425
+ "grad_norm": 0.9921875,
1426
+ "learning_rate": 9.738230516921272e-06,
1427
+ "loss": 1.5877,
1428
+ "step": 197
1429
+ },
1430
+ {
1431
+ "epoch": 1.0442632498543971,
1432
+ "grad_norm": 1.03125,
1433
+ "learning_rate": 9.651005032974994e-06,
1434
+ "loss": 1.6726,
1435
+ "step": 198
1436
+ },
1437
+ {
1438
+ "epoch": 1.04958815209252,
1439
+ "grad_norm": 1.0,
1440
+ "learning_rate": 9.563806126346643e-06,
1441
+ "loss": 1.6607,
1442
+ "step": 199
1443
+ },
1444
+ {
1445
+ "epoch": 1.054913054330643,
1446
+ "grad_norm": 1.015625,
1447
+ "learning_rate": 9.476640437570562e-06,
1448
+ "loss": 1.6926,
1449
+ "step": 200
1450
+ },
1451
+ {
1452
+ "epoch": 1.060237956568766,
1453
+ "grad_norm": 1.015625,
1454
+ "learning_rate": 9.38951460465143e-06,
1455
+ "loss": 1.6241,
1456
+ "step": 201
1457
+ },
1458
+ {
1459
+ "epoch": 1.065562858806889,
1460
+ "grad_norm": 0.9921875,
1461
+ "learning_rate": 9.302435262558748e-06,
1462
+ "loss": 1.6431,
1463
+ "step": 202
1464
+ },
1465
+ {
1466
+ "epoch": 1.070887761045012,
1467
+ "grad_norm": 1.0234375,
1468
+ "learning_rate": 9.215409042721553e-06,
1469
+ "loss": 1.6339,
1470
+ "step": 203
1471
+ },
1472
+ {
1473
+ "epoch": 1.076212663283135,
1474
+ "grad_norm": 1.0,
1475
+ "learning_rate": 9.128442572523418e-06,
1476
+ "loss": 1.6701,
1477
+ "step": 204
1478
+ },
1479
+ {
1480
+ "epoch": 1.081537565521258,
1481
+ "grad_norm": 0.98828125,
1482
+ "learning_rate": 9.04154247479776e-06,
1483
+ "loss": 1.6059,
1484
+ "step": 205
1485
+ },
1486
+ {
1487
+ "epoch": 1.086862467759381,
1488
+ "grad_norm": 1.0234375,
1489
+ "learning_rate": 8.954715367323468e-06,
1490
+ "loss": 1.6638,
1491
+ "step": 206
1492
+ },
1493
+ {
1494
+ "epoch": 1.092187369997504,
1495
+ "grad_norm": 1.09375,
1496
+ "learning_rate": 8.867967862320935e-06,
1497
+ "loss": 1.6428,
1498
+ "step": 207
1499
+ },
1500
+ {
1501
+ "epoch": 1.097512272235627,
1502
+ "grad_norm": 0.9765625,
1503
+ "learning_rate": 8.781306565948528e-06,
1504
+ "loss": 1.6424,
1505
+ "step": 208
1506
+ },
1507
+ {
1508
+ "epoch": 1.10283717447375,
1509
+ "grad_norm": 0.984375,
1510
+ "learning_rate": 8.694738077799487e-06,
1511
+ "loss": 1.6105,
1512
+ "step": 209
1513
+ },
1514
+ {
1515
+ "epoch": 1.108162076711873,
1516
+ "grad_norm": 1.0,
1517
+ "learning_rate": 8.60826899039935e-06,
1518
+ "loss": 1.7068,
1519
+ "step": 210
1520
+ },
1521
+ {
1522
+ "epoch": 1.113486978949996,
1523
+ "grad_norm": 1.0859375,
1524
+ "learning_rate": 8.521905888703894e-06,
1525
+ "loss": 1.7118,
1526
+ "step": 211
1527
+ },
1528
+ {
1529
+ "epoch": 1.118811881188119,
1530
+ "grad_norm": 1.0625,
1531
+ "learning_rate": 8.43565534959769e-06,
1532
+ "loss": 1.6553,
1533
+ "step": 212
1534
+ },
1535
+ {
1536
+ "epoch": 1.1241367834262417,
1537
+ "grad_norm": 1.0703125,
1538
+ "learning_rate": 8.349523941393224e-06,
1539
+ "loss": 1.6909,
1540
+ "step": 213
1541
+ },
1542
+ {
1543
+ "epoch": 1.1294616856643647,
1544
+ "grad_norm": 1.0546875,
1545
+ "learning_rate": 8.263518223330698e-06,
1546
+ "loss": 1.7384,
1547
+ "step": 214
1548
+ },
1549
+ {
1550
+ "epoch": 1.1347865879024877,
1551
+ "grad_norm": 1.03125,
1552
+ "learning_rate": 8.177644745078525e-06,
1553
+ "loss": 1.6182,
1554
+ "step": 215
1555
+ },
1556
+ {
1557
+ "epoch": 1.1401114901406106,
1558
+ "grad_norm": 1.125,
1559
+ "learning_rate": 8.091910046234552e-06,
1560
+ "loss": 1.7063,
1561
+ "step": 216
1562
+ },
1563
+ {
1564
+ "epoch": 1.1454363923787336,
1565
+ "grad_norm": 0.96484375,
1566
+ "learning_rate": 8.00632065582803e-06,
1567
+ "loss": 1.6621,
1568
+ "step": 217
1569
+ },
1570
+ {
1571
+ "epoch": 1.1507612946168566,
1572
+ "grad_norm": 0.98828125,
1573
+ "learning_rate": 7.92088309182241e-06,
1574
+ "loss": 1.7231,
1575
+ "step": 218
1576
+ },
1577
+ {
1578
+ "epoch": 1.1560861968549796,
1579
+ "grad_norm": 1.015625,
1580
+ "learning_rate": 7.835603860618973e-06,
1581
+ "loss": 1.6707,
1582
+ "step": 219
1583
+ },
1584
+ {
1585
+ "epoch": 1.1614110990931026,
1586
+ "grad_norm": 1.015625,
1587
+ "learning_rate": 7.750489456561351e-06,
1588
+ "loss": 1.6226,
1589
+ "step": 220
1590
+ },
1591
+ {
1592
+ "epoch": 1.1667360013312256,
1593
+ "grad_norm": 1.0703125,
1594
+ "learning_rate": 7.66554636144095e-06,
1595
+ "loss": 1.6862,
1596
+ "step": 221
1597
+ },
1598
+ {
1599
+ "epoch": 1.1720609035693486,
1600
+ "grad_norm": 1.0625,
1601
+ "learning_rate": 7.580781044003324e-06,
1602
+ "loss": 1.7325,
1603
+ "step": 222
1604
+ },
1605
+ {
1606
+ "epoch": 1.1773858058074715,
1607
+ "grad_norm": 1.03125,
1608
+ "learning_rate": 7.496199959455584e-06,
1609
+ "loss": 1.6573,
1610
+ "step": 223
1611
+ },
1612
+ {
1613
+ "epoch": 1.1827107080455945,
1614
+ "grad_norm": 1.03125,
1615
+ "learning_rate": 7.411809548974792e-06,
1616
+ "loss": 1.6594,
1617
+ "step": 224
1618
+ },
1619
+ {
1620
+ "epoch": 1.1880356102837175,
1621
+ "grad_norm": 1.015625,
1622
+ "learning_rate": 7.327616239217432e-06,
1623
+ "loss": 1.6523,
1624
+ "step": 225
1625
+ },
1626
+ {
1627
+ "epoch": 1.1933605125218405,
1628
+ "grad_norm": 1.0546875,
1629
+ "learning_rate": 7.243626441830009e-06,
1630
+ "loss": 1.6895,
1631
+ "step": 226
1632
+ },
1633
+ {
1634
+ "epoch": 1.1986854147599635,
1635
+ "grad_norm": 0.99609375,
1636
+ "learning_rate": 7.159846552960774e-06,
1637
+ "loss": 1.6883,
1638
+ "step": 227
1639
+ },
1640
+ {
1641
+ "epoch": 1.2040103169980862,
1642
+ "grad_norm": 0.9921875,
1643
+ "learning_rate": 7.076282952772634e-06,
1644
+ "loss": 1.5706,
1645
+ "step": 228
1646
+ },
1647
+ {
1648
+ "epoch": 1.2093352192362095,
1649
+ "grad_norm": 0.95703125,
1650
+ "learning_rate": 6.992942004957271e-06,
1651
+ "loss": 1.5938,
1652
+ "step": 229
1653
+ },
1654
+ {
1655
+ "epoch": 1.2146601214743322,
1656
+ "grad_norm": 1.046875,
1657
+ "learning_rate": 6.909830056250527e-06,
1658
+ "loss": 1.6963,
1659
+ "step": 230
1660
+ },
1661
+ {
1662
+ "epoch": 1.2199850237124552,
1663
+ "grad_norm": 1.0703125,
1664
+ "learning_rate": 6.826953435949081e-06,
1665
+ "loss": 1.6565,
1666
+ "step": 231
1667
+ },
1668
+ {
1669
+ "epoch": 1.2253099259505782,
1670
+ "grad_norm": 1.0078125,
1671
+ "learning_rate": 6.744318455428436e-06,
1672
+ "loss": 1.657,
1673
+ "step": 232
1674
+ },
1675
+ {
1676
+ "epoch": 1.2306348281887012,
1677
+ "grad_norm": 1.09375,
1678
+ "learning_rate": 6.661931407662292e-06,
1679
+ "loss": 1.6942,
1680
+ "step": 233
1681
+ },
1682
+ {
1683
+ "epoch": 1.2359597304268242,
1684
+ "grad_norm": 0.94921875,
1685
+ "learning_rate": 6.579798566743314e-06,
1686
+ "loss": 1.6247,
1687
+ "step": 234
1688
+ },
1689
+ {
1690
+ "epoch": 1.2412846326649472,
1691
+ "grad_norm": 1.0078125,
1692
+ "learning_rate": 6.497926187405326e-06,
1693
+ "loss": 1.6845,
1694
+ "step": 235
1695
+ },
1696
+ {
1697
+ "epoch": 1.2412846326649472,
1698
+ "eval_loss": 1.6904007196426392,
1699
+ "eval_runtime": 77.2338,
1700
+ "eval_samples_per_second": 15.278,
1701
+ "eval_steps_per_second": 15.278,
1702
+ "step": 235
1703
+ },
1704
+ {
1705
+ "epoch": 1.2466095349030701,
1706
+ "grad_norm": 1.046875,
1707
+ "learning_rate": 6.4163205045469975e-06,
1708
+ "loss": 1.6958,
1709
+ "step": 236
1710
+ },
1711
+ {
1712
+ "epoch": 1.2519344371411931,
1713
+ "grad_norm": 1.109375,
1714
+ "learning_rate": 6.334987732757028e-06,
1715
+ "loss": 1.589,
1716
+ "step": 237
1717
+ },
1718
+ {
1719
+ "epoch": 1.257259339379316,
1720
+ "grad_norm": 1.0390625,
1721
+ "learning_rate": 6.25393406584088e-06,
1722
+ "loss": 1.6417,
1723
+ "step": 238
1724
+ },
1725
+ {
1726
+ "epoch": 1.262584241617439,
1727
+ "grad_norm": 0.9609375,
1728
+ "learning_rate": 6.173165676349103e-06,
1729
+ "loss": 1.6412,
1730
+ "step": 239
1731
+ },
1732
+ {
1733
+ "epoch": 1.267909143855562,
1734
+ "grad_norm": 1.046875,
1735
+ "learning_rate": 6.092688715107265e-06,
1736
+ "loss": 1.6539,
1737
+ "step": 240
1738
+ },
1739
+ {
1740
+ "epoch": 1.273234046093685,
1741
+ "grad_norm": 1.03125,
1742
+ "learning_rate": 6.0125093107475385e-06,
1743
+ "loss": 1.6194,
1744
+ "step": 241
1745
+ },
1746
+ {
1747
+ "epoch": 1.278558948331808,
1748
+ "grad_norm": 0.9921875,
1749
+ "learning_rate": 5.932633569242e-06,
1750
+ "loss": 1.6566,
1751
+ "step": 242
1752
+ },
1753
+ {
1754
+ "epoch": 1.2838838505699308,
1755
+ "grad_norm": 1.0234375,
1756
+ "learning_rate": 5.853067573437612e-06,
1757
+ "loss": 1.6734,
1758
+ "step": 243
1759
+ },
1760
+ {
1761
+ "epoch": 1.289208752808054,
1762
+ "grad_norm": 1.046875,
1763
+ "learning_rate": 5.773817382593008e-06,
1764
+ "loss": 1.7109,
1765
+ "step": 244
1766
+ },
1767
+ {
1768
+ "epoch": 1.2945336550461768,
1769
+ "grad_norm": 1.015625,
1770
+ "learning_rate": 5.694889031917047e-06,
1771
+ "loss": 1.5984,
1772
+ "step": 245
1773
+ },
1774
+ {
1775
+ "epoch": 1.2998585572842998,
1776
+ "grad_norm": 1.046875,
1777
+ "learning_rate": 5.616288532109225e-06,
1778
+ "loss": 1.7059,
1779
+ "step": 246
1780
+ },
1781
+ {
1782
+ "epoch": 1.3051834595224228,
1783
+ "grad_norm": 1.0390625,
1784
+ "learning_rate": 5.5380218689019125e-06,
1785
+ "loss": 1.6666,
1786
+ "step": 247
1787
+ },
1788
+ {
1789
+ "epoch": 1.3105083617605457,
1790
+ "grad_norm": 1.0234375,
1791
+ "learning_rate": 5.460095002604533e-06,
1792
+ "loss": 1.6752,
1793
+ "step": 248
1794
+ },
1795
+ {
1796
+ "epoch": 1.3158332639986687,
1797
+ "grad_norm": 1.0625,
1798
+ "learning_rate": 5.382513867649663e-06,
1799
+ "loss": 1.684,
1800
+ "step": 249
1801
+ },
1802
+ {
1803
+ "epoch": 1.3211581662367917,
1804
+ "grad_norm": 0.98046875,
1805
+ "learning_rate": 5.305284372141095e-06,
1806
+ "loss": 1.6072,
1807
+ "step": 250
1808
+ },
1809
+ {
1810
+ "epoch": 1.3264830684749147,
1811
+ "grad_norm": 0.9296875,
1812
+ "learning_rate": 5.228412397403916e-06,
1813
+ "loss": 1.4978,
1814
+ "step": 251
1815
+ },
1816
+ {
1817
+ "epoch": 1.3318079707130377,
1818
+ "grad_norm": 1.0390625,
1819
+ "learning_rate": 5.151903797536631e-06,
1820
+ "loss": 1.6164,
1821
+ "step": 252
1822
+ },
1823
+ {
1824
+ "epoch": 1.3371328729511607,
1825
+ "grad_norm": 1.0703125,
1826
+ "learning_rate": 5.075764398965331e-06,
1827
+ "loss": 1.5904,
1828
+ "step": 253
1829
+ },
1830
+ {
1831
+ "epoch": 1.3424577751892837,
1832
+ "grad_norm": 1.0,
1833
+ "learning_rate": 5.000000000000003e-06,
1834
+ "loss": 1.5924,
1835
+ "step": 254
1836
+ },
1837
+ {
1838
+ "epoch": 1.3477826774274067,
1839
+ "grad_norm": 1.0390625,
1840
+ "learning_rate": 4.924616370392962e-06,
1841
+ "loss": 1.6521,
1842
+ "step": 255
1843
+ },
1844
+ {
1845
+ "epoch": 1.3531075796655296,
1846
+ "grad_norm": 1.0234375,
1847
+ "learning_rate": 4.849619250899458e-06,
1848
+ "loss": 1.6124,
1849
+ "step": 256
1850
+ },
1851
+ {
1852
+ "epoch": 1.3584324819036526,
1853
+ "grad_norm": 1.0234375,
1854
+ "learning_rate": 4.775014352840512e-06,
1855
+ "loss": 1.6634,
1856
+ "step": 257
1857
+ },
1858
+ {
1859
+ "epoch": 1.3637573841417754,
1860
+ "grad_norm": 0.984375,
1861
+ "learning_rate": 4.700807357667953e-06,
1862
+ "loss": 1.628,
1863
+ "step": 258
1864
+ },
1865
+ {
1866
+ "epoch": 1.3690822863798986,
1867
+ "grad_norm": 1.046875,
1868
+ "learning_rate": 4.627003916531761e-06,
1869
+ "loss": 1.7282,
1870
+ "step": 259
1871
+ },
1872
+ {
1873
+ "epoch": 1.3744071886180214,
1874
+ "grad_norm": 1.0625,
1875
+ "learning_rate": 4.5536096498497295e-06,
1876
+ "loss": 1.7081,
1877
+ "step": 260
1878
+ },
1879
+ {
1880
+ "epoch": 1.3797320908561446,
1881
+ "grad_norm": 1.03125,
1882
+ "learning_rate": 4.480630146879419e-06,
1883
+ "loss": 1.6772,
1884
+ "step": 261
1885
+ },
1886
+ {
1887
+ "epoch": 1.3850569930942673,
1888
+ "grad_norm": 1.0234375,
1889
+ "learning_rate": 4.408070965292534e-06,
1890
+ "loss": 1.695,
1891
+ "step": 262
1892
+ },
1893
+ {
1894
+ "epoch": 1.3903818953323903,
1895
+ "grad_norm": 0.97265625,
1896
+ "learning_rate": 4.335937630751675e-06,
1897
+ "loss": 1.6414,
1898
+ "step": 263
1899
+ },
1900
+ {
1901
+ "epoch": 1.3957067975705133,
1902
+ "grad_norm": 1.015625,
1903
+ "learning_rate": 4.264235636489542e-06,
1904
+ "loss": 1.6425,
1905
+ "step": 264
1906
+ },
1907
+ {
1908
+ "epoch": 1.4010316998086363,
1909
+ "grad_norm": 1.0390625,
1910
+ "learning_rate": 4.192970442890602e-06,
1911
+ "loss": 1.6466,
1912
+ "step": 265
1913
+ },
1914
+ {
1915
+ "epoch": 1.4063566020467593,
1916
+ "grad_norm": 1.0,
1917
+ "learning_rate": 4.12214747707527e-06,
1918
+ "loss": 1.6378,
1919
+ "step": 266
1920
+ },
1921
+ {
1922
+ "epoch": 1.4116815042848823,
1923
+ "grad_norm": 1.015625,
1924
+ "learning_rate": 4.051772132486589e-06,
1925
+ "loss": 1.718,
1926
+ "step": 267
1927
+ },
1928
+ {
1929
+ "epoch": 1.4170064065230052,
1930
+ "grad_norm": 1.1015625,
1931
+ "learning_rate": 3.981849768479516e-06,
1932
+ "loss": 1.6534,
1933
+ "step": 268
1934
+ },
1935
+ {
1936
+ "epoch": 1.4223313087611282,
1937
+ "grad_norm": 0.98046875,
1938
+ "learning_rate": 3.912385709912794e-06,
1939
+ "loss": 1.6769,
1940
+ "step": 269
1941
+ },
1942
+ {
1943
+ "epoch": 1.4276562109992512,
1944
+ "grad_norm": 1.046875,
1945
+ "learning_rate": 3.8433852467434175e-06,
1946
+ "loss": 1.6337,
1947
+ "step": 270
1948
+ },
1949
+ {
1950
+ "epoch": 1.4329811132373742,
1951
+ "grad_norm": 1.0625,
1952
+ "learning_rate": 3.774853633623806e-06,
1953
+ "loss": 1.7151,
1954
+ "step": 271
1955
+ },
1956
+ {
1957
+ "epoch": 1.4383060154754972,
1958
+ "grad_norm": 1.0234375,
1959
+ "learning_rate": 3.7067960895016277e-06,
1960
+ "loss": 1.6018,
1961
+ "step": 272
1962
+ },
1963
+ {
1964
+ "epoch": 1.4436309177136202,
1965
+ "grad_norm": 1.03125,
1966
+ "learning_rate": 3.6392177972223596e-06,
1967
+ "loss": 1.597,
1968
+ "step": 273
1969
+ },
1970
+ {
1971
+ "epoch": 1.4489558199517432,
1972
+ "grad_norm": 1.0390625,
1973
+ "learning_rate": 3.5721239031346067e-06,
1974
+ "loss": 1.6465,
1975
+ "step": 274
1976
+ },
1977
+ {
1978
+ "epoch": 1.454280722189866,
1979
+ "grad_norm": 1.03125,
1980
+ "learning_rate": 3.505519516698165e-06,
1981
+ "loss": 1.7173,
1982
+ "step": 275
1983
+ },
1984
+ {
1985
+ "epoch": 1.4596056244279891,
1986
+ "grad_norm": 1.015625,
1987
+ "learning_rate": 3.4394097100949286e-06,
1988
+ "loss": 1.5711,
1989
+ "step": 276
1990
+ },
1991
+ {
1992
+ "epoch": 1.464930526666112,
1993
+ "grad_norm": 1.03125,
1994
+ "learning_rate": 3.3737995178426276e-06,
1995
+ "loss": 1.7184,
1996
+ "step": 277
1997
+ },
1998
+ {
1999
+ "epoch": 1.4702554289042349,
2000
+ "grad_norm": 1.0234375,
2001
+ "learning_rate": 3.308693936411421e-06,
2002
+ "loss": 1.6222,
2003
+ "step": 278
2004
+ },
2005
+ {
2006
+ "epoch": 1.4755803311423579,
2007
+ "grad_norm": 0.99609375,
2008
+ "learning_rate": 3.2440979238433977e-06,
2009
+ "loss": 1.7164,
2010
+ "step": 279
2011
+ },
2012
+ {
2013
+ "epoch": 1.4809052333804809,
2014
+ "grad_norm": 1.03125,
2015
+ "learning_rate": 3.1800163993750166e-06,
2016
+ "loss": 1.8001,
2017
+ "step": 280
2018
+ },
2019
+ {
2020
+ "epoch": 1.4862301356186038,
2021
+ "grad_norm": 1.125,
2022
+ "learning_rate": 3.116454243062459e-06,
2023
+ "loss": 1.6933,
2024
+ "step": 281
2025
+ },
2026
+ {
2027
+ "epoch": 1.4915550378567268,
2028
+ "grad_norm": 1.0234375,
2029
+ "learning_rate": 3.0534162954100264e-06,
2030
+ "loss": 1.6367,
2031
+ "step": 282
2032
+ },
2033
+ {
2034
+ "epoch": 1.4915550378567268,
2035
+ "eval_loss": 1.6898616552352905,
2036
+ "eval_runtime": 78.3804,
2037
+ "eval_samples_per_second": 15.055,
2038
+ "eval_steps_per_second": 15.055,
2039
+ "step": 282
2040
+ },
2041
+ {
2042
+ "epoch": 1.4968799400948498,
2043
+ "grad_norm": 1.015625,
2044
+ "learning_rate": 2.990907357001491e-06,
2045
+ "loss": 1.6462,
2046
+ "step": 283
2047
+ },
2048
+ {
2049
+ "epoch": 1.5022048423329728,
2050
+ "grad_norm": 0.9921875,
2051
+ "learning_rate": 2.9289321881345257e-06,
2052
+ "loss": 1.6232,
2053
+ "step": 284
2054
+ },
2055
+ {
2056
+ "epoch": 1.5075297445710958,
2057
+ "grad_norm": 1.015625,
2058
+ "learning_rate": 2.867495508458186e-06,
2059
+ "loss": 1.7229,
2060
+ "step": 285
2061
+ },
2062
+ {
2063
+ "epoch": 1.5128546468092188,
2064
+ "grad_norm": 1.03125,
2065
+ "learning_rate": 2.8066019966134907e-06,
2066
+ "loss": 1.6581,
2067
+ "step": 286
2068
+ },
2069
+ {
2070
+ "epoch": 1.5181795490473418,
2071
+ "grad_norm": 0.98046875,
2072
+ "learning_rate": 2.746256289877126e-06,
2073
+ "loss": 1.6565,
2074
+ "step": 287
2075
+ },
2076
+ {
2077
+ "epoch": 1.5235044512854645,
2078
+ "grad_norm": 1.015625,
2079
+ "learning_rate": 2.6864629838082957e-06,
2080
+ "loss": 1.646,
2081
+ "step": 288
2082
+ },
2083
+ {
2084
+ "epoch": 1.5288293535235877,
2085
+ "grad_norm": 0.97265625,
2086
+ "learning_rate": 2.6272266318987606e-06,
2087
+ "loss": 1.6165,
2088
+ "step": 289
2089
+ },
2090
+ {
2091
+ "epoch": 1.5341542557617105,
2092
+ "grad_norm": 0.953125,
2093
+ "learning_rate": 2.5685517452260566e-06,
2094
+ "loss": 1.6342,
2095
+ "step": 290
2096
+ },
2097
+ {
2098
+ "epoch": 1.5394791579998337,
2099
+ "grad_norm": 1.0390625,
2100
+ "learning_rate": 2.5104427921099783e-06,
2101
+ "loss": 1.6765,
2102
+ "step": 291
2103
+ },
2104
+ {
2105
+ "epoch": 1.5448040602379565,
2106
+ "grad_norm": 1.0078125,
2107
+ "learning_rate": 2.45290419777228e-06,
2108
+ "loss": 1.6634,
2109
+ "step": 292
2110
+ },
2111
+ {
2112
+ "epoch": 1.5501289624760797,
2113
+ "grad_norm": 0.98828125,
2114
+ "learning_rate": 2.395940343999691e-06,
2115
+ "loss": 1.6478,
2116
+ "step": 293
2117
+ },
2118
+ {
2119
+ "epoch": 1.5554538647142024,
2120
+ "grad_norm": 1.0546875,
2121
+ "learning_rate": 2.339555568810221e-06,
2122
+ "loss": 1.6321,
2123
+ "step": 294
2124
+ },
2125
+ {
2126
+ "epoch": 1.5607787669523256,
2127
+ "grad_norm": 1.03125,
2128
+ "learning_rate": 2.2837541661228024e-06,
2129
+ "loss": 1.7033,
2130
+ "step": 295
2131
+ },
2132
+ {
2133
+ "epoch": 1.5661036691904484,
2134
+ "grad_norm": 0.95703125,
2135
+ "learning_rate": 2.2285403854302912e-06,
2136
+ "loss": 1.562,
2137
+ "step": 296
2138
+ },
2139
+ {
2140
+ "epoch": 1.5714285714285714,
2141
+ "grad_norm": 1.0078125,
2142
+ "learning_rate": 2.173918431475861e-06,
2143
+ "loss": 1.6686,
2144
+ "step": 297
2145
+ },
2146
+ {
2147
+ "epoch": 1.5767534736666944,
2148
+ "grad_norm": 1.015625,
2149
+ "learning_rate": 2.119892463932781e-06,
2150
+ "loss": 1.5676,
2151
+ "step": 298
2152
+ },
2153
+ {
2154
+ "epoch": 1.5820783759048174,
2155
+ "grad_norm": 0.9921875,
2156
+ "learning_rate": 2.0664665970876496e-06,
2157
+ "loss": 1.6761,
2158
+ "step": 299
2159
+ },
2160
+ {
2161
+ "epoch": 1.5874032781429404,
2162
+ "grad_norm": 1.015625,
2163
+ "learning_rate": 2.013644899527074e-06,
2164
+ "loss": 1.6786,
2165
+ "step": 300
2166
+ },
2167
+ {
2168
+ "epoch": 1.5927281803810633,
2169
+ "grad_norm": 1.0625,
2170
+ "learning_rate": 1.961431393827827e-06,
2171
+ "loss": 1.6917,
2172
+ "step": 301
2173
+ },
2174
+ {
2175
+ "epoch": 1.5980530826191863,
2176
+ "grad_norm": 1.0,
2177
+ "learning_rate": 1.9098300562505266e-06,
2178
+ "loss": 1.6204,
2179
+ "step": 302
2180
+ },
2181
+ {
2182
+ "epoch": 1.6033779848573093,
2183
+ "grad_norm": 1.046875,
2184
+ "learning_rate": 1.858844816436809e-06,
2185
+ "loss": 1.6689,
2186
+ "step": 303
2187
+ },
2188
+ {
2189
+ "epoch": 1.6087028870954323,
2190
+ "grad_norm": 0.98046875,
2191
+ "learning_rate": 1.808479557110081e-06,
2192
+ "loss": 1.7041,
2193
+ "step": 304
2194
+ },
2195
+ {
2196
+ "epoch": 1.614027789333555,
2197
+ "grad_norm": 0.96875,
2198
+ "learning_rate": 1.7587381137798432e-06,
2199
+ "loss": 1.6656,
2200
+ "step": 305
2201
+ },
2202
+ {
2203
+ "epoch": 1.6193526915716783,
2204
+ "grad_norm": 1.0,
2205
+ "learning_rate": 1.709624274449584e-06,
2206
+ "loss": 1.6269,
2207
+ "step": 306
2208
+ },
2209
+ {
2210
+ "epoch": 1.624677593809801,
2211
+ "grad_norm": 1.03125,
2212
+ "learning_rate": 1.6611417793283192e-06,
2213
+ "loss": 1.5737,
2214
+ "step": 307
2215
+ },
2216
+ {
2217
+ "epoch": 1.6300024960479242,
2218
+ "grad_norm": 1.0078125,
2219
+ "learning_rate": 1.6132943205457607e-06,
2220
+ "loss": 1.722,
2221
+ "step": 308
2222
+ },
2223
+ {
2224
+ "epoch": 1.635327398286047,
2225
+ "grad_norm": 0.99609375,
2226
+ "learning_rate": 1.566085541871145e-06,
2227
+ "loss": 1.683,
2228
+ "step": 309
2229
+ },
2230
+ {
2231
+ "epoch": 1.6406523005241702,
2232
+ "grad_norm": 1.0546875,
2233
+ "learning_rate": 1.5195190384357405e-06,
2234
+ "loss": 1.6843,
2235
+ "step": 310
2236
+ },
2237
+ {
2238
+ "epoch": 1.645977202762293,
2239
+ "grad_norm": 1.0390625,
2240
+ "learning_rate": 1.4735983564590784e-06,
2241
+ "loss": 1.6979,
2242
+ "step": 311
2243
+ },
2244
+ {
2245
+ "epoch": 1.651302105000416,
2246
+ "grad_norm": 1.015625,
2247
+ "learning_rate": 1.4283269929788779e-06,
2248
+ "loss": 1.6396,
2249
+ "step": 312
2250
+ },
2251
+ {
2252
+ "epoch": 1.656627007238539,
2253
+ "grad_norm": 0.99609375,
2254
+ "learning_rate": 1.3837083955847418e-06,
2255
+ "loss": 1.6738,
2256
+ "step": 313
2257
+ },
2258
+ {
2259
+ "epoch": 1.661951909476662,
2260
+ "grad_norm": 0.94921875,
2261
+ "learning_rate": 1.339745962155613e-06,
2262
+ "loss": 1.6566,
2263
+ "step": 314
2264
+ },
2265
+ {
2266
+ "epoch": 1.667276811714785,
2267
+ "grad_norm": 1.0078125,
2268
+ "learning_rate": 1.2964430406010032e-06,
2269
+ "loss": 1.6679,
2270
+ "step": 315
2271
+ },
2272
+ {
2273
+ "epoch": 1.672601713952908,
2274
+ "grad_norm": 1.0,
2275
+ "learning_rate": 1.2538029286060428e-06,
2276
+ "loss": 1.6892,
2277
+ "step": 316
2278
+ },
2279
+ {
2280
+ "epoch": 1.677926616191031,
2281
+ "grad_norm": 1.0390625,
2282
+ "learning_rate": 1.2118288733803474e-06,
2283
+ "loss": 1.6914,
2284
+ "step": 317
2285
+ },
2286
+ {
2287
+ "epoch": 1.6832515184291539,
2288
+ "grad_norm": 1.015625,
2289
+ "learning_rate": 1.1705240714107301e-06,
2290
+ "loss": 1.5954,
2291
+ "step": 318
2292
+ },
2293
+ {
2294
+ "epoch": 1.6885764206672769,
2295
+ "grad_norm": 0.9765625,
2296
+ "learning_rate": 1.129891668217783e-06,
2297
+ "loss": 1.633,
2298
+ "step": 319
2299
+ },
2300
+ {
2301
+ "epoch": 1.6939013229053996,
2302
+ "grad_norm": 1.0546875,
2303
+ "learning_rate": 1.0899347581163222e-06,
2304
+ "loss": 1.6976,
2305
+ "step": 320
2306
+ },
2307
+ {
2308
+ "epoch": 1.6992262251435228,
2309
+ "grad_norm": 0.9921875,
2310
+ "learning_rate": 1.0506563839797501e-06,
2311
+ "loss": 1.6283,
2312
+ "step": 321
2313
+ },
2314
+ {
2315
+ "epoch": 1.7045511273816456,
2316
+ "grad_norm": 0.9765625,
2317
+ "learning_rate": 1.012059537008332e-06,
2318
+ "loss": 1.6094,
2319
+ "step": 322
2320
+ },
2321
+ {
2322
+ "epoch": 1.7098760296197688,
2323
+ "grad_norm": 1.0390625,
2324
+ "learning_rate": 9.74147156501396e-07,
2325
+ "loss": 1.6049,
2326
+ "step": 323
2327
+ },
2328
+ {
2329
+ "epoch": 1.7152009318578916,
2330
+ "grad_norm": 1.0546875,
2331
+ "learning_rate": 9.369221296335007e-07,
2332
+ "loss": 1.6529,
2333
+ "step": 324
2334
+ },
2335
+ {
2336
+ "epoch": 1.7205258340960148,
2337
+ "grad_norm": 1.09375,
2338
+ "learning_rate": 9.00387291234569e-07,
2339
+ "loss": 1.7539,
2340
+ "step": 325
2341
+ },
2342
+ {
2343
+ "epoch": 1.7258507363341375,
2344
+ "grad_norm": 1.03125,
2345
+ "learning_rate": 8.645454235739903e-07,
2346
+ "loss": 1.6844,
2347
+ "step": 326
2348
+ },
2349
+ {
2350
+ "epoch": 1.7311756385722605,
2351
+ "grad_norm": 0.98046875,
2352
+ "learning_rate": 8.293992561487596e-07,
2353
+ "loss": 1.6409,
2354
+ "step": 327
2355
+ },
2356
+ {
2357
+ "epoch": 1.7365005408103835,
2358
+ "grad_norm": 1.03125,
2359
+ "learning_rate": 7.949514654755963e-07,
2360
+ "loss": 1.7356,
2361
+ "step": 328
2362
+ },
2363
+ {
2364
+ "epoch": 1.7418254430485065,
2365
+ "grad_norm": 1.0625,
2366
+ "learning_rate": 7.612046748871327e-07,
2367
+ "loss": 1.6681,
2368
+ "step": 329
2369
+ },
2370
+ {
2371
+ "epoch": 1.7418254430485065,
2372
+ "eval_loss": 1.6896613836288452,
2373
+ "eval_runtime": 77.1918,
2374
+ "eval_samples_per_second": 15.287,
2375
+ "eval_steps_per_second": 15.287,
2376
+ "step": 329
2377
+ },
2378
+ {
2379
+ "epoch": 1.7471503452866295,
2380
+ "grad_norm": 1.046875,
2381
+ "learning_rate": 7.281614543321269e-07,
2382
+ "loss": 1.6493,
2383
+ "step": 330
2384
+ },
2385
+ {
2386
+ "epoch": 1.7524752475247525,
2387
+ "grad_norm": 1.0234375,
2388
+ "learning_rate": 6.958243201797554e-07,
2389
+ "loss": 1.6299,
2390
+ "step": 331
2391
+ },
2392
+ {
2393
+ "epoch": 1.7578001497628755,
2394
+ "grad_norm": 1.0234375,
2395
+ "learning_rate": 6.641957350279838e-07,
2396
+ "loss": 1.6882,
2397
+ "step": 332
2398
+ },
2399
+ {
2400
+ "epoch": 1.7631250520009984,
2401
+ "grad_norm": 1.0390625,
2402
+ "learning_rate": 6.332781075160244e-07,
2403
+ "loss": 1.6256,
2404
+ "step": 333
2405
+ },
2406
+ {
2407
+ "epoch": 1.7684499542391214,
2408
+ "grad_norm": 1.0234375,
2409
+ "learning_rate": 6.030737921409169e-07,
2410
+ "loss": 1.5845,
2411
+ "step": 334
2412
+ },
2413
+ {
2414
+ "epoch": 1.7737748564772442,
2415
+ "grad_norm": 0.984375,
2416
+ "learning_rate": 5.735850890782158e-07,
2417
+ "loss": 1.6066,
2418
+ "step": 335
2419
+ },
2420
+ {
2421
+ "epoch": 1.7790997587153674,
2422
+ "grad_norm": 1.03125,
2423
+ "learning_rate": 5.448142440068316e-07,
2424
+ "loss": 1.6751,
2425
+ "step": 336
2426
+ },
2427
+ {
2428
+ "epoch": 1.7844246609534902,
2429
+ "grad_norm": 0.97265625,
2430
+ "learning_rate": 5.167634479380068e-07,
2431
+ "loss": 1.6368,
2432
+ "step": 337
2433
+ },
2434
+ {
2435
+ "epoch": 1.7897495631916134,
2436
+ "grad_norm": 1.0234375,
2437
+ "learning_rate": 4.894348370484648e-07,
2438
+ "loss": 1.7731,
2439
+ "step": 338
2440
+ },
2441
+ {
2442
+ "epoch": 1.7950744654297361,
2443
+ "grad_norm": 1.03125,
2444
+ "learning_rate": 4.628304925177318e-07,
2445
+ "loss": 1.6714,
2446
+ "step": 339
2447
+ },
2448
+ {
2449
+ "epoch": 1.8003993676678594,
2450
+ "grad_norm": 1.1015625,
2451
+ "learning_rate": 4.3695244036964567e-07,
2452
+ "loss": 1.7154,
2453
+ "step": 340
2454
+ },
2455
+ {
2456
+ "epoch": 1.8057242699059821,
2457
+ "grad_norm": 0.94921875,
2458
+ "learning_rate": 4.118026513180695e-07,
2459
+ "loss": 1.5225,
2460
+ "step": 341
2461
+ },
2462
+ {
2463
+ "epoch": 1.8110491721441053,
2464
+ "grad_norm": 1.0,
2465
+ "learning_rate": 3.8738304061681107e-07,
2466
+ "loss": 1.6143,
2467
+ "step": 342
2468
+ },
2469
+ {
2470
+ "epoch": 1.816374074382228,
2471
+ "grad_norm": 1.0,
2472
+ "learning_rate": 3.6369546791377054e-07,
2473
+ "loss": 1.6301,
2474
+ "step": 343
2475
+ },
2476
+ {
2477
+ "epoch": 1.821698976620351,
2478
+ "grad_norm": 0.95703125,
2479
+ "learning_rate": 3.4074173710931804e-07,
2480
+ "loss": 1.5861,
2481
+ "step": 344
2482
+ },
2483
+ {
2484
+ "epoch": 1.827023878858474,
2485
+ "grad_norm": 0.9921875,
2486
+ "learning_rate": 3.185235962189237e-07,
2487
+ "loss": 1.6469,
2488
+ "step": 345
2489
+ },
2490
+ {
2491
+ "epoch": 1.832348781096597,
2492
+ "grad_norm": 0.95703125,
2493
+ "learning_rate": 2.970427372400353e-07,
2494
+ "loss": 1.5941,
2495
+ "step": 346
2496
+ },
2497
+ {
2498
+ "epoch": 1.83767368333472,
2499
+ "grad_norm": 0.9609375,
2500
+ "learning_rate": 2.7630079602323447e-07,
2501
+ "loss": 1.6405,
2502
+ "step": 347
2503
+ },
2504
+ {
2505
+ "epoch": 1.842998585572843,
2506
+ "grad_norm": 1.03125,
2507
+ "learning_rate": 2.5629935214764866e-07,
2508
+ "loss": 1.6329,
2509
+ "step": 348
2510
+ },
2511
+ {
2512
+ "epoch": 1.848323487810966,
2513
+ "grad_norm": 1.03125,
2514
+ "learning_rate": 2.370399288006664e-07,
2515
+ "loss": 1.62,
2516
+ "step": 349
2517
+ },
2518
+ {
2519
+ "epoch": 1.853648390049089,
2520
+ "grad_norm": 1.0078125,
2521
+ "learning_rate": 2.1852399266194312e-07,
2522
+ "loss": 1.5775,
2523
+ "step": 350
2524
+ },
2525
+ {
2526
+ "epoch": 1.858973292287212,
2527
+ "grad_norm": 0.984375,
2528
+ "learning_rate": 2.0075295379170413e-07,
2529
+ "loss": 1.6226,
2530
+ "step": 351
2531
+ },
2532
+ {
2533
+ "epoch": 1.8642981945253347,
2534
+ "grad_norm": 1.0234375,
2535
+ "learning_rate": 1.8372816552336025e-07,
2536
+ "loss": 1.5957,
2537
+ "step": 352
2538
+ },
2539
+ {
2540
+ "epoch": 1.869623096763458,
2541
+ "grad_norm": 1.015625,
2542
+ "learning_rate": 1.6745092436045495e-07,
2543
+ "loss": 1.6389,
2544
+ "step": 353
2545
+ },
2546
+ {
2547
+ "epoch": 1.8749479990015807,
2548
+ "grad_norm": 1.0546875,
2549
+ "learning_rate": 1.519224698779198e-07,
2550
+ "loss": 1.6588,
2551
+ "step": 354
2552
+ },
2553
+ {
2554
+ "epoch": 1.880272901239704,
2555
+ "grad_norm": 0.9765625,
2556
+ "learning_rate": 1.3714398462768563e-07,
2557
+ "loss": 1.5915,
2558
+ "step": 355
2559
+ },
2560
+ {
2561
+ "epoch": 1.8855978034778267,
2562
+ "grad_norm": 1.03125,
2563
+ "learning_rate": 1.231165940486234e-07,
2564
+ "loss": 1.6454,
2565
+ "step": 356
2566
+ },
2567
+ {
2568
+ "epoch": 1.89092270571595,
2569
+ "grad_norm": 1.03125,
2570
+ "learning_rate": 1.0984136638083176e-07,
2571
+ "loss": 1.6706,
2572
+ "step": 357
2573
+ },
2574
+ {
2575
+ "epoch": 1.8962476079540727,
2576
+ "grad_norm": 1.046875,
2577
+ "learning_rate": 9.731931258429638e-08,
2578
+ "loss": 1.6479,
2579
+ "step": 358
2580
+ },
2581
+ {
2582
+ "epoch": 1.9015725101921956,
2583
+ "grad_norm": 0.93359375,
2584
+ "learning_rate": 8.555138626189619e-08,
2585
+ "loss": 1.6405,
2586
+ "step": 359
2587
+ },
2588
+ {
2589
+ "epoch": 1.9068974124303186,
2590
+ "grad_norm": 1.0234375,
2591
+ "learning_rate": 7.453848358678018e-08,
2592
+ "loss": 1.6638,
2593
+ "step": 360
2594
+ },
2595
+ {
2596
+ "epoch": 1.9122223146684416,
2597
+ "grad_norm": 1.046875,
2598
+ "learning_rate": 6.428144323412544e-08,
2599
+ "loss": 1.6709,
2600
+ "step": 361
2601
+ },
2602
+ {
2603
+ "epoch": 1.9175472169065646,
2604
+ "grad_norm": 1.0546875,
2605
+ "learning_rate": 5.4781046317267103e-08,
2606
+ "loss": 1.6191,
2607
+ "step": 362
2608
+ },
2609
+ {
2610
+ "epoch": 1.9228721191446876,
2611
+ "grad_norm": 1.0390625,
2612
+ "learning_rate": 4.603801632821148e-08,
2613
+ "loss": 1.7144,
2614
+ "step": 363
2615
+ },
2616
+ {
2617
+ "epoch": 1.9281970213828106,
2618
+ "grad_norm": 1.0078125,
2619
+ "learning_rate": 3.805301908254455e-08,
2620
+ "loss": 1.6358,
2621
+ "step": 364
2622
+ },
2623
+ {
2624
+ "epoch": 1.9335219236209336,
2625
+ "grad_norm": 1.0078125,
2626
+ "learning_rate": 3.082666266872036e-08,
2627
+ "loss": 1.6961,
2628
+ "step": 365
2629
+ },
2630
+ {
2631
+ "epoch": 1.9388468258590565,
2632
+ "grad_norm": 1.046875,
2633
+ "learning_rate": 2.4359497401758026e-08,
2634
+ "loss": 1.6074,
2635
+ "step": 366
2636
+ },
2637
+ {
2638
+ "epoch": 1.9441717280971793,
2639
+ "grad_norm": 1.03125,
2640
+ "learning_rate": 1.86520157813308e-08,
2641
+ "loss": 1.5905,
2642
+ "step": 367
2643
+ },
2644
+ {
2645
+ "epoch": 1.9494966303353025,
2646
+ "grad_norm": 0.97265625,
2647
+ "learning_rate": 1.370465245426167e-08,
2648
+ "loss": 1.6317,
2649
+ "step": 368
2650
+ },
2651
+ {
2652
+ "epoch": 1.9548215325734253,
2653
+ "grad_norm": 0.98828125,
2654
+ "learning_rate": 9.517784181422018e-09,
2655
+ "loss": 1.5783,
2656
+ "step": 369
2657
+ },
2658
+ {
2659
+ "epoch": 1.9601464348115485,
2660
+ "grad_norm": 0.98828125,
2661
+ "learning_rate": 6.091729809042379e-09,
2662
+ "loss": 1.7104,
2663
+ "step": 370
2664
+ },
2665
+ {
2666
+ "epoch": 1.9654713370496713,
2667
+ "grad_norm": 1.0546875,
2668
+ "learning_rate": 3.4267502444274013e-09,
2669
+ "loss": 1.7328,
2670
+ "step": 371
2671
+ },
2672
+ {
2673
+ "epoch": 1.9707962392877945,
2674
+ "grad_norm": 1.03125,
2675
+ "learning_rate": 1.5230484360873043e-09,
2676
+ "loss": 1.6749,
2677
+ "step": 372
2678
+ },
2679
+ {
2680
+ "epoch": 1.9761211415259172,
2681
+ "grad_norm": 0.98046875,
2682
+ "learning_rate": 3.807693582869032e-10,
2683
+ "loss": 1.5706,
2684
+ "step": 373
2685
+ },
2686
+ {
2687
+ "epoch": 1.9814460437640404,
2688
+ "grad_norm": 1.0703125,
2689
+ "learning_rate": 0.0,
2690
+ "loss": 1.662,
2691
+ "step": 374
2692
+ }
2693
+ ],
2694
+ "logging_steps": 1,
2695
+ "max_steps": 374,
2696
+ "num_input_tokens_seen": 0,
2697
+ "num_train_epochs": 2,
2698
+ "save_steps": 187,
2699
+ "stateful_callbacks": {
2700
+ "TrainerControl": {
2701
+ "args": {
2702
+ "should_epoch_stop": false,
2703
+ "should_evaluate": false,
2704
+ "should_log": false,
2705
+ "should_save": true,
2706
+ "should_training_stop": true
2707
+ },
2708
+ "attributes": {}
2709
+ }
2710
+ },
2711
+ "total_flos": 3.8540467560146534e+17,
2712
+ "train_batch_size": 1,
2713
+ "trial_name": null,
2714
+ "trial_params": null
2715
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35b4b979a5cff4130ad7b93748ff9ed06a157ef9e649ffec51ce3ecfa2d21603
3
+ size 6136
vocab.json ADDED
The diff for this file is too large to render. See raw diff