amphora committed
Commit ad3b4da · verified · 1 parent: 3c8638a

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +4 -0
  2. README.md +165 -0
  3. added_tokens.json +24 -0
  4. checkpoint-127/added_tokens.json +24 -0
  5. checkpoint-127/config.json +28 -0
  6. checkpoint-127/generation_config.json +14 -0
  7. checkpoint-127/latest +1 -0
  8. checkpoint-127/merges.txt +0 -0
  9. checkpoint-127/model.safetensors +3 -0
  10. checkpoint-127/rng_state_0.pth +3 -0
  11. checkpoint-127/rng_state_1.pth +3 -0
  12. checkpoint-127/scheduler.pt +3 -0
  13. checkpoint-127/special_tokens_map.json +31 -0
  14. checkpoint-127/tokenizer.json +3 -0
  15. checkpoint-127/tokenizer_config.json +208 -0
  16. checkpoint-127/trainer_state.json +946 -0
  17. checkpoint-127/training_args.bin +3 -0
  18. checkpoint-127/vocab.json +0 -0
  19. checkpoint-127/zero_to_fp32.py +760 -0
  20. checkpoint-254/added_tokens.json +24 -0
  21. checkpoint-254/config.json +28 -0
  22. checkpoint-254/generation_config.json +14 -0
  23. checkpoint-254/latest +1 -0
  24. checkpoint-254/merges.txt +0 -0
  25. checkpoint-254/model.safetensors +3 -0
  26. checkpoint-254/rng_state_0.pth +3 -0
  27. checkpoint-254/rng_state_1.pth +3 -0
  28. checkpoint-254/scheduler.pt +3 -0
  29. checkpoint-254/special_tokens_map.json +31 -0
  30. checkpoint-254/tokenizer.json +3 -0
  31. checkpoint-254/tokenizer_config.json +208 -0
  32. checkpoint-254/trainer_state.json +1859 -0
  33. checkpoint-254/training_args.bin +3 -0
  34. checkpoint-254/vocab.json +0 -0
  35. checkpoint-254/zero_to_fp32.py +760 -0
  36. checkpoint-381/added_tokens.json +24 -0
  37. checkpoint-381/config.json +28 -0
  38. checkpoint-381/generation_config.json +14 -0
  39. checkpoint-381/latest +1 -0
  40. checkpoint-381/merges.txt +0 -0
  41. checkpoint-381/model.safetensors +3 -0
  42. checkpoint-381/rng_state_0.pth +3 -0
  43. checkpoint-381/rng_state_1.pth +3 -0
  44. checkpoint-381/scheduler.pt +3 -0
  45. checkpoint-381/special_tokens_map.json +31 -0
  46. checkpoint-381/tokenizer.json +3 -0
  47. checkpoint-381/tokenizer_config.json +208 -0
  48. checkpoint-381/trainer_state.json +2772 -0
  49. checkpoint-381/training_args.bin +3 -0
  50. checkpoint-381/vocab.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ checkpoint-127/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-254/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-381/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
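The tokenizer.json copies flagged above are stored via Git LFS, so a plain clone without LFS yields only pointer stubs. A minimal sketch of fetching one of them resolved, assuming huggingface_hub is installed; the repo id below is a placeholder, since this commit view does not state the repository name:

```python
# Sketch: download one LFS-tracked file from the Hub.
# NOTE: "amphora/model-repo" is a hypothetical repo id; substitute the real one.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="amphora/model-repo",  # placeholder, not stated in this commit view
    filename="tokenizer.json",     # one of the LFS-tracked files above
)
print(path)  # local cache path of the resolved (non-pointer) file
```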
README.md ADDED
@@ -0,0 +1,165 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: Qwen/Qwen2.5-1.5B-Instruct
+ tags:
+ - generated_from_trainer
+ datasets:
+ - train.jsonl
+ model-index:
+ - name: outputs/out
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.6.0`
+ ```yaml
+ base_model: Qwen/Qwen2.5-1.5B-Instruct
+ model_type: AutoModelForCausalLM
+ tokenizer_type: AutoTokenizer
+ trust_remote_code: false
+
+ load_in_8bit: false
+ load_in_4bit: false
+ strict: false
+
+ output_dir: ./outputs/out
+ remove_unused_columns: false
+
+ chat_template: qwen_25
+ # chat_template: qwen_25
+ datasets:
+   - path: train.jsonl
+     type: chat_template
+     field_messages: messages
+     message_field_role: role
+     message_field_content: content
+     roles:
+       user:
+         - user
+       assistant:
+         - assistant
+
+ dataset_prepared_path: mr1-sft-1
+ # dataset_prepared_path: ko_r1
+ val_set_size: 0.005
+ eval_sample_packing: False
+
+ sequence_len: 512
+ sample_packing: False
+ pad_to_sequence_len: False
+
+ wandb_project: mergedbench
+ wandb_entity:
+ wandb_watch:
+ wandb_name:
+ wandb_log_model:
+
+ plugins:
+   - axolotl.integrations.liger.LigerPlugin
+ liger_rope: true
+ liger_rms_norm: true
+ liger_swiglu: true
+ liger_fused_linear_cross_entropy: true
+
+ gradient_accumulation_steps: 1
+ micro_batch_size: 128
+ eval_batch_size: 4
+ num_epochs: 3
+ optimizer: paged_adamw_8bit
+ lr_scheduler: cosine
+ learning_rate: 2e-5
+
+ train_on_inputs: false
+ group_by_length: false
+ bf16: auto
+ fp16:
+ tf32: false
+
+ gradient_checkpointing: true
+ gradient_checkpointing_kwargs:
+   use_reentrant: false
+ early_stopping_patience:
+ resume_from_checkpoint:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention: true
+
+ warmup_steps: 10
+ evals_per_epoch: 3
+ eval_max_new_tokens: 128
+ eval_table_size:
+ saves_per_epoch: 1
+ debug:
+ deepspeed: deepspeed_configs/zero1.json
+ weight_decay: 0.01
+ fsdp:
+ fsdp_config:
+ special_tokens:
+   eos_token:
+ ```
+
+ </details><br>
+
+ # outputs/out
+
+ This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the train.jsonl dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.3103
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 2e-05
+ - train_batch_size: 128
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 2
+ - total_train_batch_size: 256
+ - total_eval_batch_size: 8
+ - optimizer: paged_adamw_8bit (OptimizerNames.PAGED_ADAMW_8BIT) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 10
+ - num_epochs: 3.0
+
+ ### Training results
+
+ | Training Loss | Epoch  | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 4.6099        | 0.0079 | 1    | 3.1001          |
+ | 0.0071        | 0.3386 | 43   | 0.3896          |
+ | 0.0098        | 0.6772 | 86   | 0.3527          |
+ | 0.0026        | 1.0157 | 129  | 0.3306          |
+ | 0.0128        | 1.3543 | 172  | 0.3166          |
+ | 0.0042        | 1.6929 | 215  | 0.3484          |
+ | 0.0019        | 2.0315 | 258  | 0.2931          |
+ | 0.0039        | 2.3701 | 301  | 0.3032          |
+ | 0.0           | 2.7087 | 344  | 0.3103          |
+
+ ### Framework versions
+
+ - Transformers 4.48.1
+ - Pytorch 2.5.1+cu121
+ - Datasets 3.2.0
+ - Tokenizers 0.21.0
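A minimal inference sketch for the uploaded model. The repo id is a placeholder (this commit view does not state the final model name); generation settings fall back to the defaults shipped in generation_config.json:

```python
# Sketch: load the fine-tuned model and run one chat turn.
# NOTE: "amphora/model-repo" is a hypothetical repo id; substitute the real one.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "amphora/model-repo"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype="auto", device_map="auto")

messages = [{"role": "user", "content": "Hello!"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
out = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))
```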
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
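A quick sketch for sanity-checking that these added-token ids line up with the tokenizer as loaded; it assumes the repository has been downloaded to the current directory:

```python
# Sketch: verify added_tokens.json ids against the loaded tokenizer.
# NOTE: "." assumes a local copy of the repository root.
import json
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
added = json.load(open("added_tokens.json"))
for token, idx in added.items():
    assert tok.convert_tokens_to_ids(token) == idx, token
print(f"{len(added)} added tokens verified")
```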
checkpoint-127/added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
checkpoint-127/config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 1536,
+   "initializer_range": 0.02,
+   "intermediate_size": 8960,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 21,
+   "model_type": "qwen2",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 2,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.48.1",
+   "use_cache": false,
+   "use_sliding_window": false,
+   "vocab_size": 151665
+ }
checkpoint-127/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.1,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.48.1"
+ }
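These sampling defaults are what `generate()` picks up automatically when the checkpoint is loaded. A sketch of inspecting and overriding them explicitly, assuming a local copy of this checkpoint directory:

```python
# Sketch: load the checkpoint's sampling defaults and override one field.
# NOTE: "./checkpoint-127" assumes this checkpoint directory exists locally.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("./checkpoint-127")
print(gen_cfg.temperature, gen_cfg.top_p, gen_cfg.top_k)  # 0.7 0.8 20
gen_cfg.max_new_tokens = 256  # not set in the file; added here explicitly
# model.generate(**inputs, generation_config=gen_cfg)
```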
checkpoint-127/latest ADDED
@@ -0,0 +1 @@
+ global_step127
checkpoint-127/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-127/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16c2b94cb0a2b46fadee1abbb18577e9280cc0f41f0a32589eba659eaeb867f6
+ size 3552549728
checkpoint-127/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9affc1541e7e94c18354d5173bc55400c5f07faf3d080c6d453d48e7a8d6ac3
+ size 14512
checkpoint-127/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4748c3ebf0e4c051c58b92e4a8c5b87cdb39d55cfdc2aec81a1baef0f02fc113
+ size 14512
checkpoint-127/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3971f2dc488a5a415a2f023c2206a1ad3b82f9265741b7f18e7f2a8d779a4734
+ size 1064
checkpoint-127/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
checkpoint-127/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
checkpoint-127/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{%- if tools %}\n  {{- '<|im_start|>system\\n' }}\n  {%- if messages[0]['role'] == 'system' %}\n  {{- messages[0]['content'] }}\n  {%- else %}\n  {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n  {%- endif %}\n  {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n  {%- for tool in tools %}\n  {{- \"\\n\" }}\n  {{- tool | tojson }}\n  {%- endfor %}\n  {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n  {%- if messages[0]['role'] == 'system' %}\n  {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n  {%- else %}\n  {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n  {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n  {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n  {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n  {%- elif message.role == \"assistant\" %}\n  {{- '<|im_start|>' + message.role }}\n  {%- if message.content %}\n  {{- '\\n' + message.content }}\n  {%- endif %}\n  {%- for tool_call in message.tool_calls %}\n  {%- if tool_call.function is defined %}\n  {%- set tool_call = tool_call.function %}\n  {%- endif %}\n  {{- '\\n<tool_call>\\n{\"name\": \"' }}\n  {{- tool_call.name }}\n  {{- '\", \"arguments\": ' }}\n  {{- tool_call.arguments | tojson }}\n  {{- '}\\n</tool_call>' }}\n  {%- endfor %}\n  {{- '<|im_end|>\\n' }}\n  {%- elif message.role == \"tool\" %}\n  {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n  {{- '<|im_start|>user' }}\n  {%- endif %}\n  {{- '\\n<tool_response>\\n' }}\n  {{- message.content }}\n  {{- '\\n</tool_response>' }}\n  {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n  {{- '<|im_end|>\\n' }}\n  {%- endif %}\n  {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n  {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
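The `chat_template` above is the standard Qwen2.5 ChatML template. A quick sketch of how it renders a conversation, assuming a local copy of this checkpoint directory:

```python
# Sketch: render a conversation through the checkpoint's ChatML chat template.
# NOTE: "./checkpoint-127" assumes this checkpoint directory exists locally.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./checkpoint-127")
text = tok.apply_chat_template(
    [{"role": "user", "content": "Hi"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(text)
# <|im_start|>system
# You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hi<|im_end|>
# <|im_start|>assistant
```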
checkpoint-127/trainer_state.json ADDED
@@ -0,0 +1,946 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.0,
+   "eval_steps": 43,
+   "global_step": 127,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.007874015748031496,
+       "grad_norm": 118.11203002929688,
+       "learning_rate": 2.0000000000000003e-06,
+       "loss": 4.6099,
+       "step": 1
+     },
+     {
+       "epoch": 0.007874015748031496,
+       "eval_loss": 3.1001100540161133,
+       "eval_runtime": 5.3966,
+       "eval_samples_per_second": 30.204,
+       "eval_steps_per_second": 3.891,
+       "step": 1
+     },
+     {
+       "epoch": 0.015748031496062992,
+       "grad_norm": 118.4310302734375,
+       "learning_rate": 4.000000000000001e-06,
+       "loss": 4.5857,
+       "step": 2
+     },
+     {
+       "epoch": 0.023622047244094488,
+       "grad_norm": 103.37439727783203,
+       "learning_rate": 6e-06,
+       "loss": 4.3069,
+       "step": 3
+     },
+     {
+       "epoch": 0.031496062992125984,
+       "grad_norm": 75.05075073242188,
+       "learning_rate": 8.000000000000001e-06,
+       "loss": 3.8754,
+       "step": 4
+     },
+     {
+       "epoch": 0.03937007874015748,
+       "grad_norm": 50.459983825683594,
+       "learning_rate": 1e-05,
+       "loss": 3.2841,
+       "step": 5
+     },
+     {
+       "epoch": 0.047244094488188976,
+       "grad_norm": 47.4603385925293,
+       "learning_rate": 1.2e-05,
+       "loss": 2.4285,
+       "step": 6
+     },
+     {
+       "epoch": 0.05511811023622047,
+       "grad_norm": 32.362667083740234,
+       "learning_rate": 1.4e-05,
+       "loss": 1.8177,
+       "step": 7
+     },
+     {
+       "epoch": 0.06299212598425197,
+       "grad_norm": 22.846933364868164,
+       "learning_rate": 1.6000000000000003e-05,
+       "loss": 1.1567,
+       "step": 8
+     },
+     {
+       "epoch": 0.07086614173228346,
+       "grad_norm": 17.060213088989258,
+       "learning_rate": 1.8e-05,
+       "loss": 0.8257,
+       "step": 9
+     },
+     {
+       "epoch": 0.07874015748031496,
+       "grad_norm": 14.415579795837402,
+       "learning_rate": 2e-05,
+       "loss": 0.4257,
+       "step": 10
+     },
+     {
+       "epoch": 0.08661417322834646,
+       "grad_norm": 7.753712177276611,
+       "learning_rate": 1.999964147509006e-05,
+       "loss": 0.2976,
+       "step": 11
+     },
+     {
+       "epoch": 0.09448818897637795,
+       "grad_norm": 26.883708953857422,
+       "learning_rate": 1.9998565926068253e-05,
+       "loss": 0.3365,
+       "step": 12
+     },
+     {
+       "epoch": 0.10236220472440945,
+       "grad_norm": 10.675631523132324,
+       "learning_rate": 1.9996773430056806e-05,
+       "loss": 0.2161,
+       "step": 13
+     },
+     {
+       "epoch": 0.11023622047244094,
+       "grad_norm": 6.670111179351807,
+       "learning_rate": 1.999426411558661e-05,
+       "loss": 0.1816,
+       "step": 14
+     },
+     {
+       "epoch": 0.11811023622047244,
+       "grad_norm": 8.878239631652832,
+       "learning_rate": 1.9991038162588018e-05,
+       "loss": 0.1567,
+       "step": 15
+     },
+     {
+       "epoch": 0.12598425196850394,
+       "grad_norm": 2.9917383193969727,
+       "learning_rate": 1.9987095802377933e-05,
+       "loss": 0.0813,
+       "step": 16
+     },
+     {
+       "epoch": 0.13385826771653545,
+       "grad_norm": 1.0548763275146484,
+       "learning_rate": 1.9982437317643218e-05,
+       "loss": 0.0217,
+       "step": 17
+     },
+     {
+       "epoch": 0.14173228346456693,
+       "grad_norm": 2.8778488636016846,
+       "learning_rate": 1.9977063042420438e-05,
+       "loss": 0.0618,
+       "step": 18
+     },
+     {
+       "epoch": 0.14960629921259844,
+       "grad_norm": 0.9811734557151794,
+       "learning_rate": 1.99709733620719e-05,
+       "loss": 0.0175,
+       "step": 19
+     },
+     {
+       "epoch": 0.15748031496062992,
+       "grad_norm": 0.7218202948570251,
+       "learning_rate": 1.996416871325803e-05,
+       "loss": 0.0302,
+       "step": 20
+     },
+     {
+       "epoch": 0.16535433070866143,
+       "grad_norm": 1.2746995687484741,
+       "learning_rate": 1.995664958390604e-05,
+       "loss": 0.0453,
+       "step": 21
+     },
+     {
+       "epoch": 0.1732283464566929,
+       "grad_norm": 0.9413469433784485,
+       "learning_rate": 1.9948416513174976e-05,
+       "loss": 0.0175,
+       "step": 22
+     },
+     {
+       "epoch": 0.18110236220472442,
+       "grad_norm": 1.4161137342453003,
+       "learning_rate": 1.9939470091417012e-05,
+       "loss": 0.0277,
+       "step": 23
+     },
+     {
+       "epoch": 0.1889763779527559,
+       "grad_norm": 2.2721235752105713,
+       "learning_rate": 1.992981096013517e-05,
+       "loss": 0.0589,
+       "step": 24
+     },
+     {
+       "epoch": 0.1968503937007874,
+       "grad_norm": 1.143970251083374,
+       "learning_rate": 1.9919439811937283e-05,
+       "loss": 0.0182,
+       "step": 25
+     },
+     {
+       "epoch": 0.2047244094488189,
+       "grad_norm": 0.8054028749465942,
+       "learning_rate": 1.9908357390486342e-05,
+       "loss": 0.0211,
+       "step": 26
+     },
+     {
+       "epoch": 0.2125984251968504,
+       "grad_norm": 1.4449081420898438,
+       "learning_rate": 1.989656449044718e-05,
+       "loss": 0.0244,
+       "step": 27
+     },
+     {
+       "epoch": 0.2204724409448819,
+       "grad_norm": 0.49216631054878235,
+       "learning_rate": 1.988406195742948e-05,
+       "loss": 0.005,
+       "step": 28
+     },
+     {
+       "epoch": 0.2283464566929134,
+       "grad_norm": 0.9945647716522217,
+       "learning_rate": 1.987085068792715e-05,
+       "loss": 0.0373,
+       "step": 29
+     },
+     {
+       "epoch": 0.23622047244094488,
+       "grad_norm": 1.1753748655319214,
+       "learning_rate": 1.9856931629254032e-05,
+       "loss": 0.0217,
+       "step": 30
+     },
+     {
+       "epoch": 0.2440944881889764,
+       "grad_norm": 0.5960403680801392,
+       "learning_rate": 1.984230577947597e-05,
+       "loss": 0.0157,
+       "step": 31
+     },
+     {
+       "epoch": 0.25196850393700787,
+       "grad_norm": 0.3657272160053253,
+       "learning_rate": 1.9826974187339267e-05,
+       "loss": 0.0082,
+       "step": 32
+     },
+     {
+       "epoch": 0.25984251968503935,
+       "grad_norm": 1.1290266513824463,
+       "learning_rate": 1.981093795219546e-05,
+       "loss": 0.0236,
+       "step": 33
+     },
+     {
+       "epoch": 0.2677165354330709,
+       "grad_norm": 1.673962116241455,
+       "learning_rate": 1.9794198223922496e-05,
+       "loss": 0.0182,
+       "step": 34
+     },
+     {
+       "epoch": 0.2755905511811024,
+       "grad_norm": 0.540355384349823,
+       "learning_rate": 1.9776756202842297e-05,
+       "loss": 0.011,
+       "step": 35
+     },
+     {
+       "epoch": 0.28346456692913385,
+       "grad_norm": 0.3380790054798126,
+       "learning_rate": 1.9758613139634662e-05,
+       "loss": 0.0048,
+       "step": 36
+     },
+     {
+       "epoch": 0.29133858267716534,
+       "grad_norm": 1.886232852935791,
+       "learning_rate": 1.9739770335247616e-05,
+       "loss": 0.0157,
+       "step": 37
+     },
+     {
+       "epoch": 0.2992125984251969,
+       "grad_norm": 2.140639305114746,
+       "learning_rate": 1.972022914080411e-05,
+       "loss": 0.0393,
+       "step": 38
+     },
+     {
+       "epoch": 0.30708661417322836,
+       "grad_norm": 0.35308870673179626,
+       "learning_rate": 1.9699990957505136e-05,
+       "loss": 0.0074,
+       "step": 39
+     },
+     {
+       "epoch": 0.31496062992125984,
+       "grad_norm": 0.3918301463127136,
+       "learning_rate": 1.9679057236529266e-05,
+       "loss": 0.0083,
+       "step": 40
+     },
+     {
+       "epoch": 0.3228346456692913,
+       "grad_norm": 0.4406338632106781,
+       "learning_rate": 1.965742947892858e-05,
+       "loss": 0.0152,
+       "step": 41
+     },
+     {
+       "epoch": 0.33070866141732286,
+       "grad_norm": 0.6819682121276855,
+       "learning_rate": 1.9635109235521057e-05,
+       "loss": 0.0091,
+       "step": 42
+     },
+     {
+       "epoch": 0.33858267716535434,
+       "grad_norm": 0.6794927716255188,
+       "learning_rate": 1.961209810677934e-05,
+       "loss": 0.0071,
+       "step": 43
+     },
+     {
+       "epoch": 0.33858267716535434,
+       "eval_loss": 0.3895845115184784,
+       "eval_runtime": 6.5602,
+       "eval_samples_per_second": 24.847,
+       "eval_steps_per_second": 3.201,
+       "step": 43
+     },
+     {
+       "epoch": 0.3464566929133858,
+       "grad_norm": 0.3874967694282532,
+       "learning_rate": 1.9588397742716004e-05,
+       "loss": 0.0089,
+       "step": 44
+     },
+     {
+       "epoch": 0.3543307086614173,
+       "grad_norm": 0.5577577352523804,
+       "learning_rate": 1.9564009842765225e-05,
+       "loss": 0.0098,
+       "step": 45
+     },
+     {
+       "epoch": 0.36220472440944884,
+       "grad_norm": 0.8152347207069397,
+       "learning_rate": 1.9538936155660934e-05,
+       "loss": 0.0118,
+       "step": 46
+     },
+     {
+       "epoch": 0.3700787401574803,
+       "grad_norm": 0.2971118688583374,
+       "learning_rate": 1.951317847931141e-05,
+       "loss": 0.0084,
+       "step": 47
+     },
+     {
+       "epoch": 0.3779527559055118,
+       "grad_norm": 1.0286651849746704,
+       "learning_rate": 1.9486738660670373e-05,
+       "loss": 0.0123,
+       "step": 48
+     },
+     {
+       "epoch": 0.3858267716535433,
+       "grad_norm": 0.5227222442626953,
+       "learning_rate": 1.945961859560454e-05,
+       "loss": 0.0144,
+       "step": 49
+     },
+     {
+       "epoch": 0.3937007874015748,
+       "grad_norm": 0.461935818195343,
+       "learning_rate": 1.943182022875769e-05,
+       "loss": 0.0119,
+       "step": 50
+     },
+     {
+       "epoch": 0.4015748031496063,
+       "grad_norm": 1.2550626993179321,
+       "learning_rate": 1.940334555341122e-05,
+       "loss": 0.013,
+       "step": 51
+     },
+     {
+       "epoch": 0.4094488188976378,
+       "grad_norm": 0.37549659609794617,
+       "learning_rate": 1.9374196611341212e-05,
+       "loss": 0.0181,
+       "step": 52
+     },
+     {
+       "epoch": 0.41732283464566927,
+       "grad_norm": 0.3444191515445709,
+       "learning_rate": 1.9344375492672024e-05,
+       "loss": 0.0111,
+       "step": 53
+     },
+     {
+       "epoch": 0.4251968503937008,
+       "grad_norm": 0.3489387333393097,
+       "learning_rate": 1.9313884335726443e-05,
+       "loss": 0.0111,
+       "step": 54
+     },
+     {
+       "epoch": 0.4330708661417323,
+       "grad_norm": 0.26080814003944397,
+       "learning_rate": 1.9282725326872324e-05,
+       "loss": 0.0091,
+       "step": 55
+     },
+     {
+       "epoch": 0.4409448818897638,
+       "grad_norm": 0.1390451341867447,
+       "learning_rate": 1.9250900700365837e-05,
+       "loss": 0.0033,
+       "step": 56
+     },
+     {
+       "epoch": 0.44881889763779526,
+       "grad_norm": 0.20499111711978912,
+       "learning_rate": 1.921841273819125e-05,
+       "loss": 0.0066,
+       "step": 57
+     },
+     {
+       "epoch": 0.4566929133858268,
+       "grad_norm": 2.185487747192383,
+       "learning_rate": 1.918526376989731e-05,
+       "loss": 0.0095,
+       "step": 58
+     },
+     {
+       "epoch": 0.4645669291338583,
+       "grad_norm": 0.23939816653728485,
+       "learning_rate": 1.9151456172430186e-05,
+       "loss": 0.0048,
+       "step": 59
+     },
+     {
+       "epoch": 0.47244094488188976,
+       "grad_norm": 0.41510018706321716,
+       "learning_rate": 1.911699236996305e-05,
+       "loss": 0.0077,
+       "step": 60
+     },
+     {
+       "epoch": 0.48031496062992124,
+       "grad_norm": 0.264318585395813,
+       "learning_rate": 1.9081874833722234e-05,
+       "loss": 0.0129,
+       "step": 61
+     },
+     {
+       "epoch": 0.4881889763779528,
+       "grad_norm": 1.0443968772888184,
+       "learning_rate": 1.9046106081810047e-05,
+       "loss": 0.0035,
+       "step": 62
+     },
+     {
+       "epoch": 0.49606299212598426,
+       "grad_norm": 0.2800132632255554,
+       "learning_rate": 1.900968867902419e-05,
+       "loss": 0.0057,
+       "step": 63
+     },
+     {
+       "epoch": 0.5039370078740157,
+       "grad_norm": 1.114960789680481,
+       "learning_rate": 1.8972625236673887e-05,
+       "loss": 0.0123,
+       "step": 64
+     },
+     {
+       "epoch": 0.5118110236220472,
+       "grad_norm": 0.5027065873146057,
+       "learning_rate": 1.8934918412392596e-05,
+       "loss": 0.0052,
+       "step": 65
+     },
+     {
+       "epoch": 0.5196850393700787,
+       "grad_norm": 0.5564169883728027,
+       "learning_rate": 1.8896570909947477e-05,
+       "loss": 0.0085,
+       "step": 66
+     },
+     {
+       "epoch": 0.5275590551181102,
+       "grad_norm": 0.7567198872566223,
+       "learning_rate": 1.8857585479045493e-05,
+       "loss": 0.0054,
+       "step": 67
+     },
+     {
+       "epoch": 0.5354330708661418,
+       "grad_norm": 0.13573969900608063,
+       "learning_rate": 1.8817964915136277e-05,
+       "loss": 0.0008,
+       "step": 68
+     },
+     {
+       "epoch": 0.5433070866141733,
+       "grad_norm": 0.2704390287399292,
+       "learning_rate": 1.8777712059211643e-05,
+       "loss": 0.0078,
+       "step": 69
+     },
+     {
+       "epoch": 0.5511811023622047,
+       "grad_norm": 0.6014392971992493,
+       "learning_rate": 1.8736829797601903e-05,
+       "loss": 0.0059,
+       "step": 70
+     },
+     {
+       "epoch": 0.5590551181102362,
+       "grad_norm": 0.5487034916877747,
+       "learning_rate": 1.8695321061768886e-05,
+       "loss": 0.0097,
+       "step": 71
+     },
+     {
+       "epoch": 0.5669291338582677,
+       "grad_norm": 0.6670834422111511,
+       "learning_rate": 1.8653188828095754e-05,
+       "loss": 0.011,
+       "step": 72
+     },
+     {
+       "epoch": 0.5748031496062992,
+       "grad_norm": 0.1795203685760498,
+       "learning_rate": 1.8610436117673557e-05,
+       "loss": 0.0067,
+       "step": 73
+     },
+     {
+       "epoch": 0.5826771653543307,
+       "grad_norm": 1.768436074256897,
+       "learning_rate": 1.8567065996084628e-05,
+       "loss": 0.0096,
+       "step": 74
+     },
+     {
+       "epoch": 0.5905511811023622,
+       "grad_norm": 0.26233312487602234,
+       "learning_rate": 1.8523081573182754e-05,
+       "loss": 0.0124,
+       "step": 75
+     },
+     {
+       "epoch": 0.5984251968503937,
+       "grad_norm": 0.3775719404220581,
+       "learning_rate": 1.847848600287019e-05,
+       "loss": 0.0052,
+       "step": 76
+     },
+     {
+       "epoch": 0.6062992125984252,
+       "grad_norm": 1.0016565322875977,
+       "learning_rate": 1.8433282482871497e-05,
+       "loss": 0.0058,
+       "step": 77
+     },
+     {
+       "epoch": 0.6141732283464567,
+       "grad_norm": 0.20153792202472687,
+       "learning_rate": 1.8387474254504265e-05,
+       "loss": 0.0056,
+       "step": 78
+     },
+     {
+       "epoch": 0.6220472440944882,
+       "grad_norm": 0.5119822025299072,
+       "learning_rate": 1.8341064602446686e-05,
+       "loss": 0.0079,
+       "step": 79
+     },
+     {
+       "epoch": 0.6299212598425197,
+       "grad_norm": 1.5781004428863525,
+       "learning_rate": 1.829405685450202e-05,
+       "loss": 0.008,
+       "step": 80
+     },
+     {
+       "epoch": 0.6377952755905512,
+       "grad_norm": 0.23826757073402405,
+       "learning_rate": 1.824645438135999e-05,
+       "loss": 0.0041,
+       "step": 81
+     },
+     {
+       "epoch": 0.6456692913385826,
+       "grad_norm": 0.6386727690696716,
+       "learning_rate": 1.8198260596355077e-05,
+       "loss": 0.0188,
+       "step": 82
+     },
+     {
+       "epoch": 0.6535433070866141,
+       "grad_norm": 0.9503199458122253,
+       "learning_rate": 1.814947895522176e-05,
+       "loss": 0.008,
+       "step": 83
+     },
+     {
+       "epoch": 0.6614173228346457,
+       "grad_norm": 0.2040701061487198,
+       "learning_rate": 1.8100112955846746e-05,
+       "loss": 0.0038,
+       "step": 84
+     },
+     {
+       "epoch": 0.6692913385826772,
+       "grad_norm": 0.3660199046134949,
+       "learning_rate": 1.805016613801813e-05,
+       "loss": 0.0148,
+       "step": 85
+     },
+     {
+       "epoch": 0.6771653543307087,
+       "grad_norm": 1.0502821207046509,
+       "learning_rate": 1.7999642083171576e-05,
+       "loss": 0.0098,
+       "step": 86
+     },
+     {
+       "epoch": 0.6771653543307087,
+       "eval_loss": 0.3526817262172699,
+       "eval_runtime": 6.6167,
+       "eval_samples_per_second": 24.635,
+       "eval_steps_per_second": 3.174,
+       "step": 86
+     },
+     {
+       "epoch": 0.6850393700787402,
+       "grad_norm": 0.13735969364643097,
+       "learning_rate": 1.7948544414133534e-05,
+       "loss": 0.0022,
+       "step": 87
+     },
+     {
+       "epoch": 0.6929133858267716,
+       "grad_norm": 0.6425012946128845,
+       "learning_rate": 1.7896876794861443e-05,
+       "loss": 0.0086,
+       "step": 88
+     },
+     {
+       "epoch": 0.7007874015748031,
+       "grad_norm": 0.7540380954742432,
+       "learning_rate": 1.7844642930181008e-05,
+       "loss": 0.0062,
+       "step": 89
+     },
+     {
+       "epoch": 0.7086614173228346,
+       "grad_norm": 0.6727365255355835,
+       "learning_rate": 1.779184656552056e-05,
+       "loss": 0.0027,
+       "step": 90
+     },
+     {
+       "epoch": 0.7165354330708661,
+       "grad_norm": 0.14059337973594666,
+       "learning_rate": 1.773849148664247e-05,
+       "loss": 0.0056,
+       "step": 91
+     },
+     {
+       "epoch": 0.7244094488188977,
+       "grad_norm": 0.33292093873023987,
+       "learning_rate": 1.7684581519371714e-05,
+       "loss": 0.0047,
+       "step": 92
+     },
+     {
+       "epoch": 0.7322834645669292,
+       "grad_norm": 0.3809877932071686,
+       "learning_rate": 1.7630120529321518e-05,
+       "loss": 0.0139,
+       "step": 93
+     },
+     {
+       "epoch": 0.7401574803149606,
+       "grad_norm": 1.729589819908142,
+       "learning_rate": 1.7575112421616203e-05,
+       "loss": 0.0128,
+       "step": 94
+     },
+     {
+       "epoch": 0.7480314960629921,
+       "grad_norm": 0.18192608654499054,
+       "learning_rate": 1.751956114061113e-05,
+       "loss": 0.0025,
+       "step": 95
+     },
+     {
+       "epoch": 0.7559055118110236,
+       "grad_norm": 1.0333118438720703,
+       "learning_rate": 1.7463470669609907e-05,
+       "loss": 0.006,
+       "step": 96
+     },
+     {
+       "epoch": 0.7637795275590551,
+       "grad_norm": 0.7247685194015503,
+       "learning_rate": 1.7406845030578747e-05,
+       "loss": 0.0073,
+       "step": 97
+     },
+     {
+       "epoch": 0.7716535433070866,
+       "grad_norm": 0.06979379802942276,
+       "learning_rate": 1.734968828385808e-05,
+       "loss": 0.0005,
+       "step": 98
+     },
+     {
+       "epoch": 0.7795275590551181,
+       "grad_norm": 0.5137119293212891,
+       "learning_rate": 1.729200452787139e-05,
+       "loss": 0.0082,
+       "step": 99
+     },
+     {
+       "epoch": 0.7874015748031497,
+       "grad_norm": 0.4704137146472931,
+       "learning_rate": 1.7233797898831376e-05,
+       "loss": 0.005,
+       "step": 100
+     },
+     {
+       "epoch": 0.7952755905511811,
+       "grad_norm": 0.28564465045928955,
+       "learning_rate": 1.717507257044331e-05,
+       "loss": 0.0052,
+       "step": 101
+     },
+     {
+       "epoch": 0.8031496062992126,
+       "grad_norm": 0.17685537040233612,
+       "learning_rate": 1.711583275360582e-05,
+       "loss": 0.0024,
+       "step": 102
+     },
+     {
+       "epoch": 0.8110236220472441,
+       "grad_norm": 0.45714935660362244,
+       "learning_rate": 1.7056082696108896e-05,
+       "loss": 0.0072,
+       "step": 103
+     },
+     {
+       "epoch": 0.8188976377952756,
+       "grad_norm": 0.4373086988925934,
+       "learning_rate": 1.699582668232934e-05,
+       "loss": 0.0051,
+       "step": 104
+     },
+     {
+       "epoch": 0.8267716535433071,
+       "grad_norm": 0.8478983640670776,
+       "learning_rate": 1.6935069032923525e-05,
+       "loss": 0.022,
+       "step": 105
+     },
+     {
+       "epoch": 0.8346456692913385,
+       "grad_norm": 0.16181086003780365,
+       "learning_rate": 1.6873814104517617e-05,
+       "loss": 0.0058,
+       "step": 106
+     },
+     {
+       "epoch": 0.84251968503937,
+       "grad_norm": 0.09503592550754547,
+       "learning_rate": 1.6812066289395157e-05,
+       "loss": 0.0009,
+       "step": 107
+     },
+     {
+       "epoch": 0.8503937007874016,
+       "grad_norm": 0.7462632060050964,
+       "learning_rate": 1.6749830015182106e-05,
+       "loss": 0.0044,
+       "step": 108
+     },
+     {
+       "epoch": 0.8582677165354331,
+       "grad_norm": 0.07221701741218567,
+       "learning_rate": 1.6687109744529394e-05,
+       "loss": 0.0015,
+       "step": 109
+     },
+     {
+       "epoch": 0.8661417322834646,
+       "grad_norm": 0.08999036252498627,
+       "learning_rate": 1.6623909974792888e-05,
+       "loss": 0.0023,
+       "step": 110
+     },
+     {
+       "epoch": 0.8740157480314961,
+       "grad_norm": 0.42536938190460205,
+       "learning_rate": 1.656023523771095e-05,
+       "loss": 0.005,
+       "step": 111
+     },
+     {
+       "epoch": 0.8818897637795275,
+       "grad_norm": 0.7885191440582275,
+       "learning_rate": 1.6496090099079452e-05,
+       "loss": 0.0103,
+       "step": 112
+     },
+     {
+       "epoch": 0.889763779527559,
+       "grad_norm": 0.16610018908977509,
+       "learning_rate": 1.64314791584244e-05,
+       "loss": 0.006,
+       "step": 113
+     },
+     {
+       "epoch": 0.8976377952755905,
+       "grad_norm": 0.32151034474372864,
+       "learning_rate": 1.6366407048672135e-05,
+       "loss": 0.0086,
+       "step": 114
+     },
+     {
+       "epoch": 0.905511811023622,
+       "grad_norm": 0.557732343673706,
+       "learning_rate": 1.6300878435817115e-05,
+       "loss": 0.0064,
+       "step": 115
+     },
+     {
+       "epoch": 0.9133858267716536,
+       "grad_norm": 0.2238176167011261,
+       "learning_rate": 1.6234898018587336e-05,
+       "loss": 0.0065,
+       "step": 116
+     },
+     {
+       "epoch": 0.9212598425196851,
+       "grad_norm": 0.2980042099952698,
+       "learning_rate": 1.616847052810744e-05,
+       "loss": 0.0095,
+       "step": 117
+     },
+     {
+       "epoch": 0.9291338582677166,
+       "grad_norm": 0.1529705822467804,
+       "learning_rate": 1.6101600727559423e-05,
+       "loss": 0.0062,
+       "step": 118
+     },
+     {
+       "epoch": 0.937007874015748,
+       "grad_norm": 0.017149658873677254,
+       "learning_rate": 1.603429341184114e-05,
+       "loss": 0.0002,
+       "step": 119
+     },
+     {
+       "epoch": 0.9448818897637795,
+       "grad_norm": 0.4514746367931366,
+       "learning_rate": 1.596655340722244e-05,
+       "loss": 0.0067,
+       "step": 120
+     },
+     {
+       "epoch": 0.952755905511811,
+       "grad_norm": 0.11766134947538376,
+       "learning_rate": 1.5898385570999146e-05,
+       "loss": 0.0053,
+       "step": 121
+     },
+     {
+       "epoch": 0.9606299212598425,
+       "grad_norm": 0.4089784026145935,
+       "learning_rate": 1.5829794791144723e-05,
+       "loss": 0.0085,
+       "step": 122
+     },
+     {
+       "epoch": 0.968503937007874,
+       "grad_norm": 0.1353057473897934,
+       "learning_rate": 1.57607859859598e-05,
+       "loss": 0.0013,
+       "step": 123
+     },
+     {
+       "epoch": 0.9763779527559056,
+       "grad_norm": 0.6548481583595276,
+       "learning_rate": 1.5691364103719515e-05,
+       "loss": 0.0117,
+       "step": 124
+     },
+     {
+       "epoch": 0.984251968503937,
+       "grad_norm": 0.1571267992258072,
+       "learning_rate": 1.5621534122318682e-05,
+       "loss": 0.0049,
+       "step": 125
+     },
+     {
+       "epoch": 0.9921259842519685,
+       "grad_norm": 1.2177189588546753,
+       "learning_rate": 1.5551301048914863e-05,
+       "loss": 0.0161,
+       "step": 126
+     },
+     {
+       "epoch": 1.0,
+       "grad_norm": 0.414489209651947,
+       "learning_rate": 1.5480669919569313e-05,
+       "loss": 0.0181,
+       "step": 127
+     }
+   ],
+   "logging_steps": 1,
+   "max_steps": 381,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 127,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 1.3087271069889331e+17,
+   "train_batch_size": 128,
+   "trial_name": null,
+   "trial_params": null
+ }
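The `log_history` array is plain JSON, so the loss curves can be pulled out without any Trainer machinery. A small sketch, assuming a local copy of this checkpoint directory:

```python
# Sketch: extract train/eval loss series from trainer_state.json.
# NOTE: the path assumes this checkpoint directory exists locally.
import json

state = json.load(open("checkpoint-127/trainer_state.json"))
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
print(train[:3])  # [(1, 4.6099), (2, 4.5857), (3, 4.3069)]
print(evals)      # steps 1, 43, 86 in this checkpoint
```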
checkpoint-127/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:033fc2cc0303528d4e9ad523b3fd63b75e963b86dba301044379df1d98e6c394
+ size 10744
checkpoint-127/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-127/zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
+ @dataclass
40
+ class zero_model_state:
41
+ buffers: dict()
42
+ param_shapes: dict()
43
+ shared_params: list
44
+ ds_version: int
45
+ frozen_param_shapes: dict()
46
+ frozen_param_fragments: dict()
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
+ def atoi(text):
56
+ return int(text) if text.isdigit() else text
57
+
58
+
59
+ def natural_keys(text):
60
+ '''
61
+ alist.sort(key=natural_keys) sorts in human order
62
+ http://nedbatchelder.com/blog/200712/human_sorting.html
63
+ (See Toothy's implementation in the comments)
64
+ '''
65
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
66
+
67
+
68
+ def get_model_state_file(checkpoint_dir, zero_stage):
69
+ if not os.path.isdir(checkpoint_dir):
70
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
71
+
72
+ # there should be only one file
73
+ if zero_stage <= 2:
74
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
75
+ elif zero_stage == 3:
76
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
77
+
78
+ if not os.path.exists(file):
79
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
80
+
81
+ return file
82
+
83
+
84
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
85
+ # XXX: need to test that this simple glob rule works for multi-node setup too
86
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
87
+
88
+ if len(ckpt_files) == 0:
89
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
90
+
91
+ return ckpt_files
92
+
93
+
94
+ def get_optim_files(checkpoint_dir):
95
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
96
+
97
+
98
+ def get_model_state_files(checkpoint_dir):
99
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
100
+
101
+
102
+ def parse_model_states(files):
103
+ zero_model_states = []
104
+ for file in files:
105
+ state_dict = torch.load(file, map_location=device, weights_only=False)
106
+
107
+ if BUFFER_NAMES not in state_dict:
108
+ raise ValueError(f"{file} is not a model state checkpoint")
109
+ buffer_names = state_dict[BUFFER_NAMES]
110
+ if debug:
111
+ print("Found buffers:", buffer_names)
112
+
113
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
114
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
115
+ param_shapes = state_dict[PARAM_SHAPES]
116
+
117
+ # collect parameters that are included in param_shapes
118
+ param_names = []
119
+ for s in param_shapes:
120
+ for name in s.keys():
121
+ param_names.append(name)
122
+
123
+ # update with frozen parameters
124
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
125
+ if frozen_param_shapes is not None:
126
+ if debug:
127
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
128
+ param_names += list(frozen_param_shapes.keys())
129
+
130
+ # handle shared params
131
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
132
+
133
+ ds_version = state_dict.get(DS_VERSION, None)
134
+
135
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
136
+
137
+ z_model_state = zero_model_state(buffers=buffers,
138
+ param_shapes=param_shapes,
139
+ shared_params=shared_params,
140
+ ds_version=ds_version,
141
+ frozen_param_shapes=frozen_param_shapes,
142
+ frozen_param_fragments=frozen_param_fragments)
143
+ zero_model_states.append(z_model_state)
144
+
145
+ return zero_model_states
146
+
147
+
148
+ def parse_optim_states(files, ds_checkpoint_dir):
149
+ total_files = len(files)
150
+ state_dicts = []
151
+ for f in tqdm(files, desc='Loading checkpoint shards'):
152
+ state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
153
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
154
+ # and also handle the case where it was already removed by another helper script
155
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
156
+ state_dicts.append(state_dict)
157
+
158
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
159
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
160
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
161
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
162
+
163
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
164
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
165
+ # use the max of the partition_count to get the dp world_size.
166
+
167
+ if type(world_size) is list:
168
+ world_size = max(world_size)
169
+
170
+ if world_size != total_files:
171
+ raise ValueError(
172
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
173
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
174
+ )
175
+
176
+ # the groups are named differently in each stage
177
+ if zero_stage <= 2:
178
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
179
+ elif zero_stage == 3:
180
+ fp32_groups_key = FP32_FLAT_GROUPS
181
+ else:
182
+ raise ValueError(f"unknown zero stage {zero_stage}")
183
+
184
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
185
+ return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
189
+ """
190
+ Returns fp32 state_dict reconstructed from ds checkpoint
191
+
192
+ Args:
193
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
194
+
195
+ """
196
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
197
+
198
+ optim_files = get_optim_files(ds_checkpoint_dir)
199
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
200
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
201
+
202
+ model_files = get_model_state_files(ds_checkpoint_dir)
203
+
204
+ zero_model_states = parse_model_states(model_files)
205
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
206
+
207
+ if zero_stage <= 2:
208
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
209
+ exclude_frozen_parameters)
210
+ elif zero_stage == 3:
211
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
212
+ exclude_frozen_parameters)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
253
+ param_shapes = zero_model_states[0].param_shapes
254
+
255
+ # Reconstruction protocol:
256
+ #
257
+ # In ZeRO-2 each rank holds one contiguous partition of every flattened fp32 param group.
+ # Concatenating the rank partitions in order restores each full flat group; individual params
+ # are then carved out of that flat buffer sequentially via param_shapes, with alignment
+ # padding possibly left over at the end (validated by the sanity check below).
258
+
259
+ if debug:
260
+ for i in range(world_size):
261
+ for j in range(len(fp32_flat_groups[0])):
262
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
263
+
264
+ # XXX: memory usage doubles here (zero2)
265
+ num_param_groups = len(fp32_flat_groups[0])
266
+ merged_single_partition_of_fp32_groups = []
267
+ for i in range(num_param_groups):
268
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
269
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
270
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
271
+ avail_numel = sum(
272
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
273
+
274
+ if debug:
275
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
276
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
277
+ # not asserting if there is a mismatch due to possible padding
278
+ print(f"Have {avail_numel} numels to process.")
279
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
280
+
281
+ # params
282
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
283
+ # an out-of-core computing solution
284
+ total_numel = 0
285
+ total_params = 0
286
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
287
+ offset = 0
288
+ avail_numel = full_single_fp32_vector.numel()
289
+ for name, shape in shapes.items():
290
+
291
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
292
+ total_numel += unpartitioned_numel
293
+ total_params += 1
294
+
295
+ if debug:
296
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
297
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
298
+ offset += unpartitioned_numel
299
+
300
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
301
+ # avail_numel can differ by anywhere between 0 and 2*world_size. Due to two unrelated complex
302
+ # paddings performed in the code it's almost impossible to predict the exact numbers without the
303
+ # live optimizer object, so we check that the numbers are within the right range
304
+ align_to = 2 * world_size
305
+
306
+ def zero2_align(x):
307
+ return align_to * math.ceil(x / align_to)
308
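+ # Worked example of the rounding above (numbers are illustrative): with world_size=2,
+ # align_to=4, so zero2_align(1021) == 4 * ceil(1021 / 4) == 1024.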
+
309
+ if debug:
310
+ print(f"original offset={offset}, avail_numel={avail_numel}")
311
+
312
+ offset = zero2_align(offset)
313
+ avail_numel = zero2_align(avail_numel)
314
+
315
+ if debug:
316
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
317
+
318
+ # Sanity check
319
+ if offset != avail_numel:
320
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
321
+
322
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
326
+ exclude_frozen_parameters):
327
+ state_dict = OrderedDict()
328
+
329
+ # buffers
330
+ buffers = zero_model_states[0].buffers
331
+ state_dict.update(buffers)
332
+ if debug:
333
+ print(f"added {len(buffers)} buffers")
334
+
335
+ if not exclude_frozen_parameters:
336
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
337
+
338
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
339
+
340
+ # recover shared parameters
341
+ for pair in zero_model_states[0].shared_params:
342
+ if pair[1] in state_dict:
343
+ state_dict[pair[0]] = state_dict[pair[1]]
344
+
345
+ return state_dict
346
+
347
+
348
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
349
+ remainder = unpartitioned_numel % world_size
350
+ padding_numel = (world_size - remainder) if remainder else 0
351
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
352
+ return partitioned_numel, padding_numel
353
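+ # Worked example (illustrative numbers): unpartitioned_numel=10, world_size=4 gives
+ # partitioned_numel=3 and padding_numel=2, since 4 ranks * 3 elements = 12 = 10 + 2.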
+
354
+
355
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
356
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
357
+ return
358
+
359
+ if debug:
360
+ for i in range(world_size):
361
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
362
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
363
+
364
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
365
+ wanted_params = len(frozen_param_shapes)
366
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
367
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
368
+ print(f'Frozen params: Have {avail_numel} numels to process.')
369
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
370
+
371
+ total_params = 0
372
+ total_numel = 0
373
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
374
+ total_params += 1
375
+ unpartitioned_numel = shape.numel()
376
+ total_numel += unpartitioned_numel
377
+
378
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
379
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
380
+
381
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
382
+
383
+ if debug:
384
+ print(
385
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
386
+ )
387
+
388
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
+ class GatheredTensor:
392
+ """
393
+ A pseudo tensor that collects partitioned weights.
394
+ It is more memory efficient when there are multiple groups.
395
+ """
396
+
397
+ def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
398
+ self.flat_groups = flat_groups
399
+ self.flat_groups_offset = flat_groups_offset
400
+ self.offset = offset
401
+ self.partitioned_numel = partitioned_numel
402
+ self.shape = shape
403
+ self.dtype = self.flat_groups[0][0].dtype
404
+
405
+ def contiguous(self):
406
+ """
407
+ Merge partitioned weights from flat_groups into a single tensor.
408
+ """
409
+ end_idx = self.offset + self.partitioned_numel
410
+ world_size = len(self.flat_groups)
411
+ pad_flat_param_chunks = []
412
+
413
+ for rank_i in range(world_size):
414
+ # for each rank, we need to collect weights from related group/groups
415
+ flat_groups_at_rank_i = self.flat_groups[rank_i]
416
+ start_group_id = None
417
+ end_group_id = None
418
+ for group_id in range(len(self.flat_groups_offset)):
419
+ if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
420
+ start_group_id = group_id
421
+ if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
422
+ end_group_id = group_id
423
+ break
424
+ # collect weights from related group/groups
425
+ for group_id in range(start_group_id, end_group_id + 1):
426
+ flat_tensor = flat_groups_at_rank_i[group_id]
427
+ start_offset = self.offset - self.flat_groups_offset[group_id]
428
+ end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
429
+ pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
430
+
431
+ # collect weights from all ranks
432
+ pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
433
+ param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
434
+ return param
435
+
436
+
437
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
438
+ param_shapes = zero_model_states[0].param_shapes
439
+ avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
440
+
441
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at the boundary of
442
+ # each param, re-consolidating each param while dealing with any padding
443
+
444
+ # merge list of dicts, preserving order
445
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
446
+
447
+ if debug:
448
+ for i in range(world_size):
449
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
450
+
451
+ wanted_params = len(param_shapes)
452
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
453
+ # not asserting if there is a mismatch due to possible padding
454
+ avail_numel = fp32_flat_groups[0].numel() * world_size
455
+ print(f"Trainable params: Have {avail_numel} numels to process.")
456
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
457
+
458
+ # params
459
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
460
+ # an out-of-core computing solution
461
+ offset = 0
462
+ total_numel = 0
463
+ total_params = 0
464
+ flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
465
+ for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
466
+ unpartitioned_numel = shape.numel()
467
+ total_numel += unpartitioned_numel
468
+ total_params += 1
469
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
470
+
471
+ if debug:
472
+ print(
473
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
474
+ )
475
+
476
+ # memory efficient tensor
477
+ tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
478
+ state_dict[name] = tensor
479
+ offset += partitioned_numel
480
+
481
+ offset *= world_size
482
+
483
+ # Sanity check
484
+ if offset != avail_numel:
485
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
486
+
487
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
491
+ exclude_frozen_parameters):
492
+ state_dict = OrderedDict()
493
+
494
+ # buffers
495
+ buffers = zero_model_states[0].buffers
496
+ state_dict.update(buffers)
497
+ if debug:
498
+ print(f"added {len(buffers)} buffers")
499
+
500
+ if not exclude_frozen_parameters:
501
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
502
+
503
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
504
+
505
+ # recover shared parameters
506
+ for pair in zero_model_states[0].shared_params:
507
+ if pair[1] in state_dict:
508
+ state_dict[pair[0]] = state_dict[pair[1]]
509
+
510
+ return state_dict
511
+
512
+
513
+ def to_torch_tensor(state_dict, return_empty_tensor=False):
514
+ """
515
+ Convert state_dict of GatheredTensor to torch tensor
516
+ """
517
+ torch_state_dict = {}
518
+ converted_tensors = {}
519
+ for name, tensor in state_dict.items():
520
+ tensor_id = id(tensor)
521
+ if tensor_id in converted_tensors: # shared tensors
522
+ shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
523
+ torch_state_dict[name] = shared_tensor
524
+ else:
525
+ converted_tensors[tensor_id] = name
526
+ if return_empty_tensor:
527
+ torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
528
+ else:
529
+ torch_state_dict[name] = tensor.contiguous()
530
+ return torch_state_dict
531
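+ # Usage sketch (variable name is illustrative): given `lazy_sd` from
+ # get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True), calling
+ # to_torch_tensor(lazy_sd) materializes every GatheredTensor as a real CPU tensor.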
+
532
+
533
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
534
+ tag=None,
535
+ exclude_frozen_parameters=False,
536
+ lazy_mode=False):
537
+ """
538
+ Convert a ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
539
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
540
+ via a model hub.
541
+
542
+ Args:
543
+ - ``checkpoint_dir``: path to the desired checkpoint folder
544
+ - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, will attempt to read the tag from the 'latest' file, e.g., ``global_step14``
545
+ - ``exclude_frozen_parameters``: exclude frozen parameters
546
+ - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
547
+ Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it
548
+
549
+ Returns:
550
+ - pytorch ``state_dict``
551
+
552
+ A typical usage might be ::
553
+
554
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
555
+ # do the training and checkpoint saving
556
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
557
+ model = model.cpu() # move to cpu
558
+ model.load_state_dict(state_dict)
559
+ # submit to model hub or save the model to share with others
560
+
561
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
562
+ application, i.e. you will need to re-initialize the deepspeed engine, since
563
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
564
+
565
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
566
+
567
+ Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
568
+ You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
569
+ the checkpoint. Or you can load state_dict in lazy mode ::
570
+
571
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
572
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
573
+ for name, lazy_tensor in state_dict.items():
574
+ tensor = lazy_tensor.contiguous() # to cpu
575
+ print(name, tensor)
576
+ # del tensor to release memory if it is no longer in use
577
+ """
578
+ if tag is None:
579
+ latest_path = os.path.join(checkpoint_dir, 'latest')
580
+ if os.path.isfile(latest_path):
581
+ with open(latest_path, 'r') as fd:
582
+ tag = fd.read().strip()
583
+ else:
584
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
585
+
586
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
587
+
588
+ if not os.path.isdir(ds_checkpoint_dir):
589
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
590
+
591
+ state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
592
+ if lazy_mode:
593
+ return state_dict
594
+ else:
595
+ return to_torch_tensor(state_dict)
596
+
597
+
598
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
599
+ output_dir,
600
+ max_shard_size="5GB",
601
+ safe_serialization=False,
602
+ tag=None,
603
+ exclude_frozen_parameters=False):
604
+ """
605
+ Convert a ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
606
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
607
+
608
+ Args:
609
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
610
+ - ``output_dir``: directory for the pytorch fp32 state_dict output files
611
+ - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
612
+ - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
613
+ - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, will attempt to read the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
614
+ - ``exclude_frozen_parameters``: exclude frozen parameters
615
+ """
616
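+ # Minimal usage sketch (the output path below is illustrative):
+ # convert_zero_checkpoint_to_fp32_state_dict('checkpoint-254', 'checkpoint-254-fp32',
+ # safe_serialization=True)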
+
617
+ # Dependency pre-check
618
+ if safe_serialization:
619
+ try:
620
+ from safetensors.torch import save_file
621
+ except ImportError:
622
+ print('If you want to use `safe_serialization`, please `pip install safetensors`')
623
+ raise
624
+ if max_shard_size is not None:
625
+ try:
626
+ from huggingface_hub import split_torch_state_dict_into_shards
627
+ except ImportError:
628
+ print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
629
+ raise
630
+
631
+ # Convert zero checkpoint to state_dict
632
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
633
+ tag,
634
+ exclude_frozen_parameters,
635
+ lazy_mode=True)
636
+
637
+ # Shard the model if it is too big.
638
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
639
+ if max_shard_size is not None:
640
+ filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
641
+ # a memory-efficient approach for sharding
642
+ empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
643
+ state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
644
+ filename_pattern=filename_pattern,
645
+ max_shard_size=max_shard_size)
646
+ else:
647
+ from collections import namedtuple
648
+ StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
649
+ state_dict_split = StateDictSplit(is_sharded=False,
650
+ filename_to_tensors={weights_name: list(state_dict.keys())})
651
+
652
+ # Save the model by shard
653
+ os.makedirs(output_dir, exist_ok=True)
654
+ filename_to_tensors = state_dict_split.filename_to_tensors.items()
655
+ for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
656
+ shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
657
+ shard_state_dict = to_torch_tensor(shard_state_dict)
658
+ output_path = os.path.join(output_dir, shard_file)
659
+ if safe_serialization:
660
+ save_file(shard_state_dict, output_path, metadata={"format": "pt"})
661
+ else:
662
+ torch.save(shard_state_dict, output_path)
663
+ # release the memory of current shard
664
+ for tensor_name in list(shard_state_dict.keys()):
665
+ del state_dict[tensor_name]
666
+ del shard_state_dict[tensor_name]
667
+ del shard_state_dict
668
+ gc.collect()
669
+
670
+ # Save index if sharded
671
+ if state_dict_split.is_sharded:
672
+ index = {
673
+ "metadata": state_dict_split.metadata,
674
+ "weight_map": state_dict_split.tensor_to_filename,
675
+ }
676
+ save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
677
+ save_index_file = os.path.join(output_dir, save_index_file)
678
+ with open(save_index_file, "w", encoding="utf-8") as f:
679
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
680
+ f.write(content)
681
+
682
+
683
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
684
+ """
685
+ 1. Move the provided model to cpu
686
+ 2. Convert a ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
687
+ 3. Load it into the provided model
688
+
689
+ Args:
690
+ - ``model``: the model object to update
691
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
692
+ - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, will attempt to read the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
693
+
694
+ Returns:
695
+ - ``model``: modified model
696
+
697
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
698
+ have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
699
+ conveniently placed for you in the checkpoint folder.
700
+
701
+ A typical usage might be ::
702
+
703
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
704
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
705
+ # submit to model hub or save the model to share with others
706
+
707
+ Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
708
+ of the same application, i.e. you will need to re-initialize the deepspeed engine, since
709
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
710
+
711
+ """
712
+ logger.info(f"Extracting fp32 weights")
713
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
714
+
715
+ logger.info(f"Overwriting model with fp32 weights")
716
+ model = model.cpu()
717
+ model.load_state_dict(state_dict, strict=False)
718
+
719
+ return model
720
+
721
+
722
+ if __name__ == "__main__":
723
+ parser = argparse.ArgumentParser()
724
+ parser.add_argument("checkpoint_dir",
725
+ type=str,
726
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
727
+ parser.add_argument("output_dir",
728
+ type=str,
729
+ help="directory to the pytorch fp32 state_dict output files"
730
+ "(e.g. path/checkpoint-12-output/)")
731
+ parser.add_argument(
732
+ "--max_shard_size",
733
+ type=str,
734
+ default="5GB",
735
+ help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
736
+ "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
737
+ "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
738
+ "without CPU OOM issues.")
739
+ parser.add_argument(
740
+ "--safe_serialization",
741
+ default=False,
742
+ action='store_true',
743
+ help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
744
+ parser.add_argument("-t",
745
+ "--tag",
746
+ type=str,
747
+ default=None,
748
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
749
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
750
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
751
+ args = parser.parse_args()
752
+
753
+ debug = args.debug
754
+
755
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
756
+ args.output_dir,
757
+ max_shard_size=args.max_shard_size,
758
+ safe_serialization=args.safe_serialization,
759
+ tag=args.tag,
760
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
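+
+ # Command-line sketch for this repo's checkpoints (output directory name is illustrative):
+ # python zero_to_fp32.py checkpoint-254 checkpoint-254-fp32 --safe_serialization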
checkpoint-254/added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-254/config.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 21,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 28,
17
+ "num_key_value_heads": 2,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_scaling": null,
20
+ "rope_theta": 1000000.0,
21
+ "sliding_window": null,
22
+ "tie_word_embeddings": true,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.48.1",
25
+ "use_cache": false,
26
+ "use_sliding_window": false,
27
+ "vocab_size": 151665
28
+ }
checkpoint-254/generation_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.48.1"
14
+ }
checkpoint-254/latest ADDED
@@ -0,0 +1 @@
1
+ global_step254
checkpoint-254/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-254/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3eedece3cb1bc71666f6a20e4c5916c10d3c11d652973d8a607631461886c5c
3
+ size 3552549728
checkpoint-254/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dcb161b22b2558dbf7e3f8c871050cec383d11a40423fab11f18d5e630639bf
3
+ size 14512
checkpoint-254/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d50af6aef769414a6f28fa1b1bc51ce707dc8ecd15474e03f99a2f10fde086be
3
+ size 14512
checkpoint-254/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:997c6d2d08e07c59dc46077a3e8a02345edb321c7cf3b941c4dee43d635bb3ca
3
+ size 1064
checkpoint-254/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-254/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoint-254/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
checkpoint-254/trainer_state.json ADDED
@@ -0,0 +1,1859 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 43,
6
+ "global_step": 254,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.007874015748031496,
13
+ "grad_norm": 118.11203002929688,
14
+ "learning_rate": 2.0000000000000003e-06,
15
+ "loss": 4.6099,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.007874015748031496,
20
+ "eval_loss": 3.1001100540161133,
21
+ "eval_runtime": 5.3966,
22
+ "eval_samples_per_second": 30.204,
23
+ "eval_steps_per_second": 3.891,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.015748031496062992,
28
+ "grad_norm": 118.4310302734375,
29
+ "learning_rate": 4.000000000000001e-06,
30
+ "loss": 4.5857,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 0.023622047244094488,
35
+ "grad_norm": 103.37439727783203,
36
+ "learning_rate": 6e-06,
37
+ "loss": 4.3069,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.031496062992125984,
42
+ "grad_norm": 75.05075073242188,
43
+ "learning_rate": 8.000000000000001e-06,
44
+ "loss": 3.8754,
45
+ "step": 4
46
+ },
47
+ {
48
+ "epoch": 0.03937007874015748,
49
+ "grad_norm": 50.459983825683594,
50
+ "learning_rate": 1e-05,
51
+ "loss": 3.2841,
52
+ "step": 5
53
+ },
54
+ {
55
+ "epoch": 0.047244094488188976,
56
+ "grad_norm": 47.4603385925293,
57
+ "learning_rate": 1.2e-05,
58
+ "loss": 2.4285,
59
+ "step": 6
60
+ },
61
+ {
62
+ "epoch": 0.05511811023622047,
63
+ "grad_norm": 32.362667083740234,
64
+ "learning_rate": 1.4e-05,
65
+ "loss": 1.8177,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.06299212598425197,
70
+ "grad_norm": 22.846933364868164,
71
+ "learning_rate": 1.6000000000000003e-05,
72
+ "loss": 1.1567,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.07086614173228346,
77
+ "grad_norm": 17.060213088989258,
78
+ "learning_rate": 1.8e-05,
79
+ "loss": 0.8257,
80
+ "step": 9
81
+ },
82
+ {
83
+ "epoch": 0.07874015748031496,
84
+ "grad_norm": 14.415579795837402,
85
+ "learning_rate": 2e-05,
86
+ "loss": 0.4257,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.08661417322834646,
91
+ "grad_norm": 7.753712177276611,
92
+ "learning_rate": 1.999964147509006e-05,
93
+ "loss": 0.2976,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.09448818897637795,
98
+ "grad_norm": 26.883708953857422,
99
+ "learning_rate": 1.9998565926068253e-05,
100
+ "loss": 0.3365,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.10236220472440945,
105
+ "grad_norm": 10.675631523132324,
106
+ "learning_rate": 1.9996773430056806e-05,
107
+ "loss": 0.2161,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.11023622047244094,
112
+ "grad_norm": 6.670111179351807,
113
+ "learning_rate": 1.999426411558661e-05,
114
+ "loss": 0.1816,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.11811023622047244,
119
+ "grad_norm": 8.878239631652832,
120
+ "learning_rate": 1.9991038162588018e-05,
121
+ "loss": 0.1567,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.12598425196850394,
126
+ "grad_norm": 2.9917383193969727,
127
+ "learning_rate": 1.9987095802377933e-05,
128
+ "loss": 0.0813,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.13385826771653545,
133
+ "grad_norm": 1.0548763275146484,
134
+ "learning_rate": 1.9982437317643218e-05,
135
+ "loss": 0.0217,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.14173228346456693,
140
+ "grad_norm": 2.8778488636016846,
141
+ "learning_rate": 1.9977063042420438e-05,
142
+ "loss": 0.0618,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.14960629921259844,
147
+ "grad_norm": 0.9811734557151794,
148
+ "learning_rate": 1.99709733620719e-05,
149
+ "loss": 0.0175,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.15748031496062992,
154
+ "grad_norm": 0.7218202948570251,
155
+ "learning_rate": 1.996416871325803e-05,
156
+ "loss": 0.0302,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.16535433070866143,
161
+ "grad_norm": 1.2746995687484741,
162
+ "learning_rate": 1.995664958390604e-05,
163
+ "loss": 0.0453,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 0.1732283464566929,
168
+ "grad_norm": 0.9413469433784485,
169
+ "learning_rate": 1.9948416513174976e-05,
170
+ "loss": 0.0175,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 0.18110236220472442,
175
+ "grad_norm": 1.4161137342453003,
176
+ "learning_rate": 1.9939470091417012e-05,
177
+ "loss": 0.0277,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.1889763779527559,
182
+ "grad_norm": 2.2721235752105713,
183
+ "learning_rate": 1.992981096013517e-05,
184
+ "loss": 0.0589,
185
+ "step": 24
186
+ },
187
+ {
188
+ "epoch": 0.1968503937007874,
189
+ "grad_norm": 1.143970251083374,
190
+ "learning_rate": 1.9919439811937283e-05,
191
+ "loss": 0.0182,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 0.2047244094488189,
196
+ "grad_norm": 0.8054028749465942,
197
+ "learning_rate": 1.9908357390486342e-05,
198
+ "loss": 0.0211,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 0.2125984251968504,
203
+ "grad_norm": 1.4449081420898438,
204
+ "learning_rate": 1.989656449044718e-05,
205
+ "loss": 0.0244,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 0.2204724409448819,
210
+ "grad_norm": 0.49216631054878235,
211
+ "learning_rate": 1.988406195742948e-05,
212
+ "loss": 0.005,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 0.2283464566929134,
217
+ "grad_norm": 0.9945647716522217,
218
+ "learning_rate": 1.987085068792715e-05,
219
+ "loss": 0.0373,
220
+ "step": 29
221
+ },
222
+ {
223
+ "epoch": 0.23622047244094488,
224
+ "grad_norm": 1.1753748655319214,
225
+ "learning_rate": 1.9856931629254032e-05,
226
+ "loss": 0.0217,
227
+ "step": 30
228
+ },
229
+ {
230
+ "epoch": 0.2440944881889764,
231
+ "grad_norm": 0.5960403680801392,
232
+ "learning_rate": 1.984230577947597e-05,
233
+ "loss": 0.0157,
234
+ "step": 31
235
+ },
236
+ {
237
+ "epoch": 0.25196850393700787,
238
+ "grad_norm": 0.3657272160053253,
239
+ "learning_rate": 1.9826974187339267e-05,
240
+ "loss": 0.0082,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 0.25984251968503935,
245
+ "grad_norm": 1.1290266513824463,
246
+ "learning_rate": 1.981093795219546e-05,
247
+ "loss": 0.0236,
248
+ "step": 33
249
+ },
250
+ {
251
+ "epoch": 0.2677165354330709,
252
+ "grad_norm": 1.673962116241455,
253
+ "learning_rate": 1.9794198223922496e-05,
254
+ "loss": 0.0182,
255
+ "step": 34
256
+ },
257
+ {
258
+ "epoch": 0.2755905511811024,
259
+ "grad_norm": 0.540355384349823,
260
+ "learning_rate": 1.9776756202842297e-05,
261
+ "loss": 0.011,
262
+ "step": 35
263
+ },
264
+ {
265
+ "epoch": 0.28346456692913385,
266
+ "grad_norm": 0.3380790054798126,
267
+ "learning_rate": 1.9758613139634662e-05,
268
+ "loss": 0.0048,
269
+ "step": 36
270
+ },
271
+ {
272
+ "epoch": 0.29133858267716534,
273
+ "grad_norm": 1.886232852935791,
274
+ "learning_rate": 1.9739770335247616e-05,
275
+ "loss": 0.0157,
276
+ "step": 37
277
+ },
278
+ {
279
+ "epoch": 0.2992125984251969,
280
+ "grad_norm": 2.140639305114746,
281
+ "learning_rate": 1.972022914080411e-05,
282
+ "loss": 0.0393,
283
+ "step": 38
284
+ },
285
+ {
286
+ "epoch": 0.30708661417322836,
287
+ "grad_norm": 0.35308870673179626,
288
+ "learning_rate": 1.9699990957505136e-05,
289
+ "loss": 0.0074,
290
+ "step": 39
291
+ },
292
+ {
293
+ "epoch": 0.31496062992125984,
294
+ "grad_norm": 0.3918301463127136,
295
+ "learning_rate": 1.9679057236529266e-05,
296
+ "loss": 0.0083,
297
+ "step": 40
298
+ },
299
+ {
300
+ "epoch": 0.3228346456692913,
301
+ "grad_norm": 0.4406338632106781,
302
+ "learning_rate": 1.965742947892858e-05,
303
+ "loss": 0.0152,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 0.33070866141732286,
308
+ "grad_norm": 0.6819682121276855,
309
+ "learning_rate": 1.9635109235521057e-05,
310
+ "loss": 0.0091,
311
+ "step": 42
312
+ },
313
+ {
314
+ "epoch": 0.33858267716535434,
315
+ "grad_norm": 0.6794927716255188,
316
+ "learning_rate": 1.961209810677934e-05,
317
+ "loss": 0.0071,
318
+ "step": 43
319
+ },
320
+ {
321
+ "epoch": 0.33858267716535434,
322
+ "eval_loss": 0.3895845115184784,
323
+ "eval_runtime": 6.5602,
324
+ "eval_samples_per_second": 24.847,
325
+ "eval_steps_per_second": 3.201,
326
+ "step": 43
327
+ },
328
+ {
329
+ "epoch": 0.3464566929133858,
330
+ "grad_norm": 0.3874967694282532,
331
+ "learning_rate": 1.9588397742716004e-05,
332
+ "loss": 0.0089,
333
+ "step": 44
334
+ },
335
+ {
336
+ "epoch": 0.3543307086614173,
337
+ "grad_norm": 0.5577577352523804,
338
+ "learning_rate": 1.9564009842765225e-05,
339
+ "loss": 0.0098,
340
+ "step": 45
341
+ },
342
+ {
343
+ "epoch": 0.36220472440944884,
344
+ "grad_norm": 0.8152347207069397,
345
+ "learning_rate": 1.9538936155660934e-05,
346
+ "loss": 0.0118,
347
+ "step": 46
348
+ },
349
+ {
350
+ "epoch": 0.3700787401574803,
351
+ "grad_norm": 0.2971118688583374,
352
+ "learning_rate": 1.951317847931141e-05,
353
+ "loss": 0.0084,
354
+ "step": 47
355
+ },
356
+ {
357
+ "epoch": 0.3779527559055118,
358
+ "grad_norm": 1.0286651849746704,
359
+ "learning_rate": 1.9486738660670373e-05,
360
+ "loss": 0.0123,
361
+ "step": 48
362
+ },
363
+ {
364
+ "epoch": 0.3858267716535433,
365
+ "grad_norm": 0.5227222442626953,
366
+ "learning_rate": 1.945961859560454e-05,
367
+ "loss": 0.0144,
368
+ "step": 49
369
+ },
370
+ {
371
+ "epoch": 0.3937007874015748,
372
+ "grad_norm": 0.461935818195343,
373
+ "learning_rate": 1.943182022875769e-05,
374
+ "loss": 0.0119,
375
+ "step": 50
376
+ },
377
+ {
378
+ "epoch": 0.4015748031496063,
379
+ "grad_norm": 1.2550626993179321,
380
+ "learning_rate": 1.940334555341122e-05,
381
+ "loss": 0.013,
382
+ "step": 51
383
+ },
384
+ {
385
+ "epoch": 0.4094488188976378,
386
+ "grad_norm": 0.37549659609794617,
387
+ "learning_rate": 1.9374196611341212e-05,
388
+ "loss": 0.0181,
389
+ "step": 52
390
+ },
391
+ {
392
+ "epoch": 0.41732283464566927,
393
+ "grad_norm": 0.3444191515445709,
394
+ "learning_rate": 1.9344375492672024e-05,
395
+ "loss": 0.0111,
396
+ "step": 53
397
+ },
398
+ {
399
+ "epoch": 0.4251968503937008,
400
+ "grad_norm": 0.3489387333393097,
401
+ "learning_rate": 1.9313884335726443e-05,
402
+ "loss": 0.0111,
403
+ "step": 54
404
+ },
405
+ {
406
+ "epoch": 0.4330708661417323,
407
+ "grad_norm": 0.26080814003944397,
408
+ "learning_rate": 1.9282725326872324e-05,
409
+ "loss": 0.0091,
410
+ "step": 55
411
+ },
412
+ {
413
+ "epoch": 0.4409448818897638,
414
+ "grad_norm": 0.1390451341867447,
415
+ "learning_rate": 1.9250900700365837e-05,
416
+ "loss": 0.0033,
417
+ "step": 56
418
+ },
419
+ {
420
+ "epoch": 0.44881889763779526,
421
+ "grad_norm": 0.20499111711978912,
422
+ "learning_rate": 1.921841273819125e-05,
423
+ "loss": 0.0066,
424
+ "step": 57
425
+ },
426
+ {
427
+ "epoch": 0.4566929133858268,
428
+ "grad_norm": 2.185487747192383,
429
+ "learning_rate": 1.918526376989731e-05,
430
+ "loss": 0.0095,
431
+ "step": 58
432
+ },
433
+ {
434
+ "epoch": 0.4645669291338583,
435
+ "grad_norm": 0.23939816653728485,
436
+ "learning_rate": 1.9151456172430186e-05,
437
+ "loss": 0.0048,
438
+ "step": 59
439
+ },
440
+ {
441
+ "epoch": 0.47244094488188976,
442
+ "grad_norm": 0.41510018706321716,
443
+ "learning_rate": 1.911699236996305e-05,
444
+ "loss": 0.0077,
445
+ "step": 60
446
+ },
447
+ {
448
+ "epoch": 0.48031496062992124,
449
+ "grad_norm": 0.264318585395813,
450
+ "learning_rate": 1.9081874833722234e-05,
451
+ "loss": 0.0129,
452
+ "step": 61
453
+ },
454
+ {
455
+ "epoch": 0.4881889763779528,
456
+ "grad_norm": 1.0443968772888184,
457
+ "learning_rate": 1.9046106081810047e-05,
458
+ "loss": 0.0035,
459
+ "step": 62
460
+ },
461
+ {
462
+ "epoch": 0.49606299212598426,
463
+ "grad_norm": 0.2800132632255554,
464
+ "learning_rate": 1.900968867902419e-05,
465
+ "loss": 0.0057,
466
+ "step": 63
467
+ },
468
+ {
469
+ "epoch": 0.5039370078740157,
470
+ "grad_norm": 1.114960789680481,
471
+ "learning_rate": 1.8972625236673887e-05,
472
+ "loss": 0.0123,
473
+ "step": 64
474
+ },
475
+ {
476
+ "epoch": 0.5118110236220472,
477
+ "grad_norm": 0.5027065873146057,
478
+ "learning_rate": 1.8934918412392596e-05,
479
+ "loss": 0.0052,
480
+ "step": 65
481
+ },
482
+ {
483
+ "epoch": 0.5196850393700787,
484
+ "grad_norm": 0.5564169883728027,
485
+ "learning_rate": 1.8896570909947477e-05,
486
+ "loss": 0.0085,
487
+ "step": 66
488
+ },
489
+ {
490
+ "epoch": 0.5275590551181102,
491
+ "grad_norm": 0.7567198872566223,
492
+ "learning_rate": 1.8857585479045493e-05,
493
+ "loss": 0.0054,
494
+ "step": 67
495
+ },
496
+ {
497
+ "epoch": 0.5354330708661418,
498
+ "grad_norm": 0.13573969900608063,
499
+ "learning_rate": 1.8817964915136277e-05,
500
+ "loss": 0.0008,
501
+ "step": 68
502
+ },
503
+ {
504
+ "epoch": 0.5433070866141733,
505
+ "grad_norm": 0.2704390287399292,
506
+ "learning_rate": 1.8777712059211643e-05,
507
+ "loss": 0.0078,
508
+ "step": 69
509
+ },
510
+ {
511
+ "epoch": 0.5511811023622047,
512
+ "grad_norm": 0.6014392971992493,
513
+ "learning_rate": 1.8736829797601903e-05,
514
+ "loss": 0.0059,
515
+ "step": 70
516
+ },
517
+ {
518
+ "epoch": 0.5590551181102362,
519
+ "grad_norm": 0.5487034916877747,
520
+ "learning_rate": 1.8695321061768886e-05,
521
+ "loss": 0.0097,
522
+ "step": 71
523
+ },
524
+ {
525
+ "epoch": 0.5669291338582677,
526
+ "grad_norm": 0.6670834422111511,
527
+ "learning_rate": 1.8653188828095754e-05,
528
+ "loss": 0.011,
529
+ "step": 72
530
+ },
531
+ {
532
+ "epoch": 0.5748031496062992,
533
+ "grad_norm": 0.1795203685760498,
534
+ "learning_rate": 1.8610436117673557e-05,
535
+ "loss": 0.0067,
536
+ "step": 73
537
+ },
538
+ {
539
+ "epoch": 0.5826771653543307,
540
+ "grad_norm": 1.768436074256897,
541
+ "learning_rate": 1.8567065996084628e-05,
542
+ "loss": 0.0096,
543
+ "step": 74
544
+ },
545
+ {
546
+ "epoch": 0.5905511811023622,
547
+ "grad_norm": 0.26233312487602234,
548
+ "learning_rate": 1.8523081573182754e-05,
549
+ "loss": 0.0124,
550
+ "step": 75
551
+ },
552
+ {
553
+ "epoch": 0.5984251968503937,
554
+ "grad_norm": 0.3775719404220581,
555
+ "learning_rate": 1.847848600287019e-05,
556
+ "loss": 0.0052,
557
+ "step": 76
558
+ },
559
+ {
560
+ "epoch": 0.6062992125984252,
561
+ "grad_norm": 1.0016565322875977,
562
+ "learning_rate": 1.8433282482871497e-05,
563
+ "loss": 0.0058,
564
+ "step": 77
565
+ },
566
+ {
567
+ "epoch": 0.6141732283464567,
568
+ "grad_norm": 0.20153792202472687,
569
+ "learning_rate": 1.8387474254504265e-05,
570
+ "loss": 0.0056,
571
+ "step": 78
572
+ },
573
+ {
574
+ "epoch": 0.6220472440944882,
575
+ "grad_norm": 0.5119822025299072,
576
+ "learning_rate": 1.8341064602446686e-05,
577
+ "loss": 0.0079,
578
+ "step": 79
579
+ },
580
+ {
581
+ "epoch": 0.6299212598425197,
582
+ "grad_norm": 1.5781004428863525,
583
+ "learning_rate": 1.829405685450202e-05,
584
+ "loss": 0.008,
585
+ "step": 80
586
+ },
587
+ {
588
+ "epoch": 0.6377952755905512,
589
+ "grad_norm": 0.23826757073402405,
590
+ "learning_rate": 1.824645438135999e-05,
591
+ "loss": 0.0041,
592
+ "step": 81
593
+ },
594
+ {
595
+ "epoch": 0.6456692913385826,
596
+ "grad_norm": 0.6386727690696716,
597
+ "learning_rate": 1.8198260596355077e-05,
598
+ "loss": 0.0188,
599
+ "step": 82
600
+ },
601
+ {
602
+ "epoch": 0.6535433070866141,
603
+ "grad_norm": 0.9503199458122253,
604
+ "learning_rate": 1.814947895522176e-05,
605
+ "loss": 0.008,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 0.6614173228346457,
610
+ "grad_norm": 0.2040701061487198,
611
+ "learning_rate": 1.8100112955846746e-05,
612
+ "loss": 0.0038,
613
+ "step": 84
614
+ },
615
+ {
616
+ "epoch": 0.6692913385826772,
617
+ "grad_norm": 0.3660199046134949,
618
+ "learning_rate": 1.805016613801813e-05,
619
+ "loss": 0.0148,
620
+ "step": 85
621
+ },
622
+ {
623
+ "epoch": 0.6771653543307087,
624
+ "grad_norm": 1.0502821207046509,
625
+ "learning_rate": 1.7999642083171576e-05,
626
+ "loss": 0.0098,
627
+ "step": 86
628
+ },
629
+ {
630
+ "epoch": 0.6771653543307087,
631
+ "eval_loss": 0.3526817262172699,
632
+ "eval_runtime": 6.6167,
633
+ "eval_samples_per_second": 24.635,
634
+ "eval_steps_per_second": 3.174,
635
+ "step": 86
636
+ },
637
+ {
638
+ "epoch": 0.6850393700787402,
639
+ "grad_norm": 0.13735969364643097,
640
+ "learning_rate": 1.7948544414133534e-05,
641
+ "loss": 0.0022,
642
+ "step": 87
643
+ },
644
+ {
645
+ "epoch": 0.6929133858267716,
646
+ "grad_norm": 0.6425012946128845,
647
+ "learning_rate": 1.7896876794861443e-05,
648
+ "loss": 0.0086,
649
+ "step": 88
650
+ },
651
+ {
652
+ "epoch": 0.7007874015748031,
653
+ "grad_norm": 0.7540380954742432,
654
+ "learning_rate": 1.7844642930181008e-05,
655
+ "loss": 0.0062,
656
+ "step": 89
657
+ },
658
+ {
659
+ "epoch": 0.7086614173228346,
660
+ "grad_norm": 0.6727365255355835,
661
+ "learning_rate": 1.779184656552056e-05,
662
+ "loss": 0.0027,
663
+ "step": 90
664
+ },
665
+ {
666
+ "epoch": 0.7165354330708661,
667
+ "grad_norm": 0.14059337973594666,
668
+ "learning_rate": 1.773849148664247e-05,
669
+ "loss": 0.0056,
670
+ "step": 91
671
+ },
672
+ {
673
+ "epoch": 0.7244094488188977,
674
+ "grad_norm": 0.33292093873023987,
675
+ "learning_rate": 1.7684581519371714e-05,
676
+ "loss": 0.0047,
677
+ "step": 92
678
+ },
679
+ {
680
+ "epoch": 0.7322834645669292,
681
+ "grad_norm": 0.3809877932071686,
682
+ "learning_rate": 1.7630120529321518e-05,
683
+ "loss": 0.0139,
684
+ "step": 93
685
+ },
686
+ {
687
+ "epoch": 0.7401574803149606,
688
+ "grad_norm": 1.729589819908142,
689
+ "learning_rate": 1.7575112421616203e-05,
690
+ "loss": 0.0128,
691
+ "step": 94
692
+ },
693
+ {
694
+ "epoch": 0.7480314960629921,
695
+ "grad_norm": 0.18192608654499054,
696
+ "learning_rate": 1.751956114061113e-05,
697
+ "loss": 0.0025,
698
+ "step": 95
699
+ },
700
+ {
701
+ "epoch": 0.7559055118110236,
702
+ "grad_norm": 1.0333118438720703,
703
+ "learning_rate": 1.7463470669609907e-05,
704
+ "loss": 0.006,
705
+ "step": 96
706
+ },
707
+ {
708
+ "epoch": 0.7637795275590551,
709
+ "grad_norm": 0.7247685194015503,
710
+ "learning_rate": 1.7406845030578747e-05,
711
+ "loss": 0.0073,
712
+ "step": 97
713
+ },
714
+ {
715
+ "epoch": 0.7716535433070866,
716
+ "grad_norm": 0.06979379802942276,
717
+ "learning_rate": 1.734968828385808e-05,
718
+ "loss": 0.0005,
719
+ "step": 98
720
+ },
721
+ {
722
+ "epoch": 0.7795275590551181,
723
+ "grad_norm": 0.5137119293212891,
724
+ "learning_rate": 1.729200452787139e-05,
725
+ "loss": 0.0082,
726
+ "step": 99
727
+ },
728
+ {
729
+ "epoch": 0.7874015748031497,
730
+ "grad_norm": 0.4704137146472931,
731
+ "learning_rate": 1.7233797898831376e-05,
732
+ "loss": 0.005,
733
+ "step": 100
734
+ },
735
+ {
736
+ "epoch": 0.7952755905511811,
737
+ "grad_norm": 0.28564465045928955,
738
+ "learning_rate": 1.717507257044331e-05,
739
+ "loss": 0.0052,
740
+ "step": 101
741
+ },
742
+ {
743
+ "epoch": 0.8031496062992126,
744
+ "grad_norm": 0.17685537040233612,
745
+ "learning_rate": 1.711583275360582e-05,
746
+ "loss": 0.0024,
747
+ "step": 102
748
+ },
749
+ {
750
+ "epoch": 0.8110236220472441,
751
+ "grad_norm": 0.45714935660362244,
752
+ "learning_rate": 1.7056082696108896e-05,
753
+ "loss": 0.0072,
754
+ "step": 103
755
+ },
756
+ {
757
+ "epoch": 0.8188976377952756,
758
+ "grad_norm": 0.4373086988925934,
759
+ "learning_rate": 1.699582668232934e-05,
760
+ "loss": 0.0051,
761
+ "step": 104
762
+ },
763
+ {
764
+ "epoch": 0.8267716535433071,
765
+ "grad_norm": 0.8478983640670776,
766
+ "learning_rate": 1.6935069032923525e-05,
767
+ "loss": 0.022,
768
+ "step": 105
769
+ },
770
+ {
771
+ "epoch": 0.8346456692913385,
772
+ "grad_norm": 0.16181086003780365,
773
+ "learning_rate": 1.6873814104517617e-05,
774
+ "loss": 0.0058,
775
+ "step": 106
776
+ },
777
+ {
778
+ "epoch": 0.84251968503937,
779
+ "grad_norm": 0.09503592550754547,
780
+ "learning_rate": 1.6812066289395157e-05,
781
+ "loss": 0.0009,
782
+ "step": 107
783
+ },
784
+ {
785
+ "epoch": 0.8503937007874016,
786
+ "grad_norm": 0.7462632060050964,
787
+ "learning_rate": 1.6749830015182106e-05,
788
+ "loss": 0.0044,
789
+ "step": 108
790
+ },
791
+ {
792
+ "epoch": 0.8582677165354331,
793
+ "grad_norm": 0.07221701741218567,
794
+ "learning_rate": 1.6687109744529394e-05,
795
+ "loss": 0.0015,
796
+ "step": 109
797
+ },
798
+ {
799
+ "epoch": 0.8661417322834646,
800
+ "grad_norm": 0.08999036252498627,
801
+ "learning_rate": 1.6623909974792888e-05,
802
+ "loss": 0.0023,
803
+ "step": 110
804
+ },
805
+ {
806
+ "epoch": 0.8740157480314961,
807
+ "grad_norm": 0.42536938190460205,
808
+ "learning_rate": 1.656023523771095e-05,
809
+ "loss": 0.005,
810
+ "step": 111
811
+ },
812
+ {
813
+ "epoch": 0.8818897637795275,
814
+ "grad_norm": 0.7885191440582275,
815
+ "learning_rate": 1.6496090099079452e-05,
816
+ "loss": 0.0103,
817
+ "step": 112
818
+ },
819
+ {
820
+ "epoch": 0.889763779527559,
821
+ "grad_norm": 0.16610018908977509,
822
+ "learning_rate": 1.64314791584244e-05,
823
+ "loss": 0.006,
824
+ "step": 113
825
+ },
826
+ {
827
+ "epoch": 0.8976377952755905,
828
+ "grad_norm": 0.32151034474372864,
829
+ "learning_rate": 1.6366407048672135e-05,
830
+ "loss": 0.0086,
831
+ "step": 114
832
+ },
833
+ {
834
+ "epoch": 0.905511811023622,
835
+ "grad_norm": 0.557732343673706,
836
+ "learning_rate": 1.6300878435817115e-05,
837
+ "loss": 0.0064,
838
+ "step": 115
839
+ },
840
+ {
841
+ "epoch": 0.9133858267716536,
842
+ "grad_norm": 0.2238176167011261,
843
+ "learning_rate": 1.6234898018587336e-05,
844
+ "loss": 0.0065,
845
+ "step": 116
846
+ },
847
+ {
848
+ "epoch": 0.9212598425196851,
849
+ "grad_norm": 0.2980042099952698,
850
+ "learning_rate": 1.616847052810744e-05,
851
+ "loss": 0.0095,
852
+ "step": 117
853
+ },
854
+ {
855
+ "epoch": 0.9291338582677166,
856
+ "grad_norm": 0.1529705822467804,
857
+ "learning_rate": 1.6101600727559423e-05,
858
+ "loss": 0.0062,
859
+ "step": 118
860
+ },
861
+ {
862
+ "epoch": 0.937007874015748,
863
+ "grad_norm": 0.017149658873677254,
864
+ "learning_rate": 1.603429341184114e-05,
865
+ "loss": 0.0002,
866
+ "step": 119
867
+ },
868
+ {
869
+ "epoch": 0.9448818897637795,
870
+ "grad_norm": 0.4514746367931366,
871
+ "learning_rate": 1.596655340722244e-05,
872
+ "loss": 0.0067,
873
+ "step": 120
874
+ },
875
+ {
876
+ "epoch": 0.952755905511811,
877
+ "grad_norm": 0.11766134947538376,
878
+ "learning_rate": 1.5898385570999146e-05,
879
+ "loss": 0.0053,
880
+ "step": 121
881
+ },
882
+ {
883
+ "epoch": 0.9606299212598425,
884
+ "grad_norm": 0.4089784026145935,
885
+ "learning_rate": 1.5829794791144723e-05,
886
+ "loss": 0.0085,
887
+ "step": 122
888
+ },
889
+ {
890
+ "epoch": 0.968503937007874,
891
+ "grad_norm": 0.1353057473897934,
892
+ "learning_rate": 1.57607859859598e-05,
893
+ "loss": 0.0013,
894
+ "step": 123
895
+ },
896
+ {
897
+ "epoch": 0.9763779527559056,
898
+ "grad_norm": 0.6548481583595276,
899
+ "learning_rate": 1.5691364103719515e-05,
900
+ "loss": 0.0117,
901
+ "step": 124
902
+ },
903
+ {
904
+ "epoch": 0.984251968503937,
905
+ "grad_norm": 0.1571267992258072,
906
+ "learning_rate": 1.5621534122318682e-05,
907
+ "loss": 0.0049,
908
+ "step": 125
909
+ },
910
+ {
911
+ "epoch": 0.9921259842519685,
912
+ "grad_norm": 1.2177189588546753,
913
+ "learning_rate": 1.5551301048914863e-05,
914
+ "loss": 0.0161,
915
+ "step": 126
916
+ },
917
+ {
918
+ "epoch": 1.0,
919
+ "grad_norm": 0.414489209651947,
920
+ "learning_rate": 1.5480669919569313e-05,
921
+ "loss": 0.0181,
922
+ "step": 127
923
+ },
924
+ {
925
+ "epoch": 1.0078740157480315,
926
+ "grad_norm": 0.10985995829105377,
927
+ "learning_rate": 1.54096457988859e-05,
928
+ "loss": 0.0049,
929
+ "step": 128
930
+ },
931
+ {
932
+ "epoch": 1.015748031496063,
933
+ "grad_norm": 0.12780147790908813,
934
+ "learning_rate": 1.533823377964791e-05,
935
+ "loss": 0.0026,
936
+ "step": 129
937
+ },
938
+ {
939
+ "epoch": 1.015748031496063,
940
+ "eval_loss": 0.33064374327659607,
941
+ "eval_runtime": 6.9286,
942
+ "eval_samples_per_second": 23.526,
943
+ "eval_steps_per_second": 3.031,
944
+ "step": 129
945
+ },
946
+ {
947
+ "epoch": 1.0236220472440944,
948
+ "grad_norm": 0.5142458081245422,
949
+ "learning_rate": 1.52664389824529e-05,
950
+ "loss": 0.0082,
951
+ "step": 130
952
+ },
953
+ {
954
+ "epoch": 1.031496062992126,
955
+ "grad_norm": 0.15617145597934723,
956
+ "learning_rate": 1.5194266555345505e-05,
957
+ "loss": 0.0016,
958
+ "step": 131
959
+ },
960
+ {
961
+ "epoch": 1.0393700787401574,
962
+ "grad_norm": 0.5782387852668762,
963
+ "learning_rate": 1.5121721673448319e-05,
964
+ "loss": 0.0117,
965
+ "step": 132
966
+ },
967
+ {
968
+ "epoch": 1.047244094488189,
969
+ "grad_norm": 0.08414836972951889,
970
+ "learning_rate": 1.5048809538590789e-05,
971
+ "loss": 0.0021,
972
+ "step": 133
973
+ },
974
+ {
975
+ "epoch": 1.0551181102362204,
976
+ "grad_norm": 0.28253939747810364,
977
+ "learning_rate": 1.4975535378936228e-05,
978
+ "loss": 0.0055,
979
+ "step": 134
980
+ },
981
+ {
982
+ "epoch": 1.0629921259842519,
983
+ "grad_norm": 0.47917842864990234,
984
+ "learning_rate": 1.490190444860694e-05,
985
+ "loss": 0.0046,
986
+ "step": 135
987
+ },
988
+ {
989
+ "epoch": 1.0708661417322836,
990
+ "grad_norm": 0.1895662248134613,
991
+ "learning_rate": 1.482792202730745e-05,
992
+ "loss": 0.006,
993
+ "step": 136
994
+ },
995
+ {
996
+ "epoch": 1.078740157480315,
997
+ "grad_norm": 0.13722768425941467,
998
+ "learning_rate": 1.475359341994595e-05,
999
+ "loss": 0.0031,
1000
+ "step": 137
1001
+ },
1002
+ {
1003
+ "epoch": 1.0866141732283465,
1004
+ "grad_norm": 0.10731153190135956,
1005
+ "learning_rate": 1.4678923956253894e-05,
1006
+ "loss": 0.0005,
1007
+ "step": 138
1008
+ },
1009
+ {
1010
+ "epoch": 1.094488188976378,
1011
+ "grad_norm": 0.12261265516281128,
1012
+ "learning_rate": 1.460391899040383e-05,
1013
+ "loss": 0.0031,
1014
+ "step": 139
1015
+ },
1016
+ {
1017
+ "epoch": 1.1023622047244095,
1018
+ "grad_norm": 0.0038245893083512783,
1019
+ "learning_rate": 1.4528583900625481e-05,
1020
+ "loss": 0.0,
1021
+ "step": 140
1022
+ },
1023
+ {
1024
+ "epoch": 1.110236220472441,
1025
+ "grad_norm": 0.28762558102607727,
1026
+ "learning_rate": 1.4452924088820101e-05,
1027
+ "loss": 0.004,
1028
+ "step": 141
1029
+ },
1030
+ {
1031
+ "epoch": 1.1181102362204725,
1032
+ "grad_norm": 0.17267552018165588,
1033
+ "learning_rate": 1.4376944980173138e-05,
1034
+ "loss": 0.0002,
1035
+ "step": 142
1036
+ },
1037
+ {
1038
+ "epoch": 1.125984251968504,
1039
+ "grad_norm": 0.12727122008800507,
1040
+ "learning_rate": 1.4300652022765207e-05,
1041
+ "loss": 0.0029,
1042
+ "step": 143
1043
+ },
1044
+ {
1045
+ "epoch": 1.1338582677165354,
1046
+ "grad_norm": 0.25049135088920593,
1047
+ "learning_rate": 1.4224050687181442e-05,
1048
+ "loss": 0.0108,
1049
+ "step": 144
1050
+ },
1051
+ {
1052
+ "epoch": 1.141732283464567,
1053
+ "grad_norm": 0.16092728078365326,
1054
+ "learning_rate": 1.4147146466119235e-05,
1055
+ "loss": 0.0024,
1056
+ "step": 145
1057
+ },
1058
+ {
1059
+ "epoch": 1.1496062992125984,
1060
+ "grad_norm": 0.13642658293247223,
1061
+ "learning_rate": 1.406994487399437e-05,
1062
+ "loss": 0.0037,
1063
+ "step": 146
1064
+ },
1065
+ {
1066
+ "epoch": 1.1574803149606299,
1067
+ "grad_norm": 0.9029403328895569,
1068
+ "learning_rate": 1.3992451446545624e-05,
1069
+ "loss": 0.0034,
1070
+ "step": 147
1071
+ },
1072
+ {
1073
+ "epoch": 1.1653543307086613,
1074
+ "grad_norm": 0.19518424570560455,
1075
+ "learning_rate": 1.3914671740437811e-05,
1076
+ "loss": 0.0057,
1077
+ "step": 148
1078
+ },
1079
+ {
1080
+ "epoch": 1.1732283464566928,
1081
+ "grad_norm": 0.12140502035617828,
1082
+ "learning_rate": 1.3836611332863356e-05,
1083
+ "loss": 0.0041,
1084
+ "step": 149
1085
+ },
1086
+ {
1087
+ "epoch": 1.1811023622047245,
1088
+ "grad_norm": 0.5148038864135742,
1089
+ "learning_rate": 1.3758275821142382e-05,
1090
+ "loss": 0.0026,
1091
+ "step": 150
1092
+ },
1093
+ {
1094
+ "epoch": 1.188976377952756,
1095
+ "grad_norm": 1.828904390335083,
1096
+ "learning_rate": 1.3679670822321347e-05,
1097
+ "loss": 0.0024,
1098
+ "step": 151
1099
+ },
1100
+ {
1101
+ "epoch": 1.1968503937007875,
1102
+ "grad_norm": 0.3571717143058777,
1103
+ "learning_rate": 1.3600801972770272e-05,
1104
+ "loss": 0.0106,
1105
+ "step": 152
1106
+ },
1107
+ {
1108
+ "epoch": 1.204724409448819,
1109
+ "grad_norm": 0.051027003675699234,
1110
+ "learning_rate": 1.3521674927778594e-05,
1111
+ "loss": 0.0003,
1112
+ "step": 153
1113
+ },
1114
+ {
1115
+ "epoch": 1.2125984251968505,
1116
+ "grad_norm": 0.6490982174873352,
1117
+ "learning_rate": 1.3442295361149651e-05,
1118
+ "loss": 0.0035,
1119
+ "step": 154
1120
+ },
1121
+ {
1122
+ "epoch": 1.220472440944882,
1123
+ "grad_norm": 0.08408445864915848,
1124
+ "learning_rate": 1.336266896479384e-05,
1125
+ "loss": 0.0027,
1126
+ "step": 155
1127
+ },
1128
+ {
1129
+ "epoch": 1.2283464566929134,
1130
+ "grad_norm": 0.09666562080383301,
1131
+ "learning_rate": 1.328280144832047e-05,
1132
+ "loss": 0.0019,
1133
+ "step": 156
1134
+ },
1135
+ {
1136
+ "epoch": 1.236220472440945,
1137
+ "grad_norm": 0.03880690038204193,
1138
+ "learning_rate": 1.3202698538628376e-05,
1139
+ "loss": 0.0003,
1140
+ "step": 157
1141
+ },
1142
+ {
1143
+ "epoch": 1.2440944881889764,
1144
+ "grad_norm": 0.11940775066614151,
1145
+ "learning_rate": 1.3122365979495259e-05,
1146
+ "loss": 0.0024,
1147
+ "step": 158
1148
+ },
1149
+ {
1150
+ "epoch": 1.2519685039370079,
1151
+ "grad_norm": 0.1442880481481552,
1152
+ "learning_rate": 1.3041809531165819e-05,
1153
+ "loss": 0.0015,
1154
+ "step": 159
1155
+ },
1156
+ {
1157
+ "epoch": 1.2598425196850394,
1158
+ "grad_norm": 0.1961939036846161,
1159
+ "learning_rate": 1.2961034969938732e-05,
1160
+ "loss": 0.0056,
1161
+ "step": 160
1162
+ },
1163
+ {
1164
+ "epoch": 1.2677165354330708,
1165
+ "grad_norm": 0.26947638392448425,
1166
+ "learning_rate": 1.288004808775246e-05,
1167
+ "loss": 0.0028,
1168
+ "step": 161
1169
+ },
1170
+ {
1171
+ "epoch": 1.2755905511811023,
1172
+ "grad_norm": 0.5154056549072266,
1173
+ "learning_rate": 1.2798854691769927e-05,
1174
+ "loss": 0.0037,
1175
+ "step": 162
1176
+ },
1177
+ {
1178
+ "epoch": 1.2834645669291338,
1179
+ "grad_norm": 0.4292369782924652,
1180
+ "learning_rate": 1.2717460603962132e-05,
1181
+ "loss": 0.0029,
1182
+ "step": 163
1183
+ },
1184
+ {
1185
+ "epoch": 1.2913385826771653,
1186
+ "grad_norm": 0.19139212369918823,
1187
+ "learning_rate": 1.2635871660690677e-05,
1188
+ "loss": 0.0061,
1189
+ "step": 164
1190
+ },
1191
+ {
1192
+ "epoch": 1.2992125984251968,
1193
+ "grad_norm": 0.19960306584835052,
1194
+ "learning_rate": 1.2554093712289267e-05,
1195
+ "loss": 0.005,
1196
+ "step": 165
1197
+ },
1198
+ {
1199
+ "epoch": 1.3070866141732282,
1200
+ "grad_norm": 0.4523830711841583,
1201
+ "learning_rate": 1.2472132622644222e-05,
1202
+ "loss": 0.0065,
1203
+ "step": 166
1204
+ },
1205
+ {
1206
+ "epoch": 1.3149606299212597,
1207
+ "grad_norm": 0.49343299865722656,
1208
+ "learning_rate": 1.2389994268773995e-05,
1209
+ "loss": 0.0061,
1210
+ "step": 167
1211
+ },
1212
+ {
1213
+ "epoch": 1.3228346456692912,
1214
+ "grad_norm": 0.01938088797032833,
1215
+ "learning_rate": 1.2307684540407775e-05,
1216
+ "loss": 0.0001,
1217
+ "step": 168
1218
+ },
1219
+ {
1220
+ "epoch": 1.330708661417323,
1221
+ "grad_norm": 0.3082112669944763,
1222
+ "learning_rate": 1.2225209339563144e-05,
1223
+ "loss": 0.0053,
1224
+ "step": 169
1225
+ },
1226
+ {
1227
+ "epoch": 1.3385826771653544,
1228
+ "grad_norm": 0.01982509344816208,
1229
+ "learning_rate": 1.2142574580122903e-05,
1230
+ "loss": 0.0001,
1231
+ "step": 170
1232
+ },
1233
+ {
1234
+ "epoch": 1.3464566929133859,
1235
+ "grad_norm": 0.12388588488101959,
1236
+ "learning_rate": 1.2059786187410984e-05,
1237
+ "loss": 0.0049,
1238
+ "step": 171
1239
+ },
1240
+ {
1241
+ "epoch": 1.3543307086614174,
1242
+ "grad_norm": 0.43759095668792725,
1243
+ "learning_rate": 1.1976850097767598e-05,
1244
+ "loss": 0.0128,
1245
+ "step": 172
1246
+ },
1247
+ {
1248
+ "epoch": 1.3543307086614174,
1249
+ "eval_loss": 0.3166251480579376,
1250
+ "eval_runtime": 6.9515,
1251
+ "eval_samples_per_second": 23.448,
1252
+ "eval_steps_per_second": 3.021,
1253
+ "step": 172
1254
+ },
1255
+ {
1256
+ "epoch": 1.3622047244094488,
1257
+ "grad_norm": 0.46561670303344727,
1258
+ "learning_rate": 1.1893772258123554e-05,
1259
+ "loss": 0.008,
1260
+ "step": 173
1261
+ },
1262
+ {
1263
+ "epoch": 1.3700787401574803,
1264
+ "grad_norm": 0.16612188518047333,
1265
+ "learning_rate": 1.1810558625573856e-05,
1266
+ "loss": 0.0024,
1267
+ "step": 174
1268
+ },
1269
+ {
1270
+ "epoch": 1.3779527559055118,
1271
+ "grad_norm": 0.13628093898296356,
1272
+ "learning_rate": 1.1727215166950519e-05,
1273
+ "loss": 0.0045,
1274
+ "step": 175
1275
+ },
1276
+ {
1277
+ "epoch": 1.3858267716535433,
1278
+ "grad_norm": 0.565229058265686,
1279
+ "learning_rate": 1.1643747858394743e-05,
1280
+ "loss": 0.0103,
1281
+ "step": 176
1282
+ },
1283
+ {
1284
+ "epoch": 1.3937007874015748,
1285
+ "grad_norm": 0.14550763368606567,
1286
+ "learning_rate": 1.156016268492839e-05,
1287
+ "loss": 0.0028,
1288
+ "step": 177
1289
+ },
1290
+ {
1291
+ "epoch": 1.4015748031496063,
1292
+ "grad_norm": 0.12460129708051682,
1293
+ "learning_rate": 1.1476465640024814e-05,
1294
+ "loss": 0.0031,
1295
+ "step": 178
1296
+ },
1297
+ {
1298
+ "epoch": 1.4094488188976377,
1299
+ "grad_norm": 0.19089221954345703,
1300
+ "learning_rate": 1.1392662725179114e-05,
1301
+ "loss": 0.0035,
1302
+ "step": 179
1303
+ },
1304
+ {
1305
+ "epoch": 1.4173228346456692,
1306
+ "grad_norm": 0.6106573343276978,
1307
+ "learning_rate": 1.1308759949477786e-05,
1308
+ "loss": 0.0088,
1309
+ "step": 180
1310
+ },
1311
+ {
1312
+ "epoch": 1.425196850393701,
1313
+ "grad_norm": 0.20053207874298096,
1314
+ "learning_rate": 1.1224763329167859e-05,
1315
+ "loss": 0.0033,
1316
+ "step": 181
1317
+ },
1318
+ {
1319
+ "epoch": 1.4330708661417324,
1320
+ "grad_norm": 0.1984691321849823,
1321
+ "learning_rate": 1.1140678887225468e-05,
1322
+ "loss": 0.0051,
1323
+ "step": 182
1324
+ },
1325
+ {
1326
+ "epoch": 1.4409448818897639,
1327
+ "grad_norm": 0.19264858961105347,
1328
+ "learning_rate": 1.1056512652924014e-05,
1329
+ "loss": 0.0046,
1330
+ "step": 183
1331
+ },
1332
+ {
1333
+ "epoch": 1.4488188976377954,
1334
+ "grad_norm": 0.10979076474905014,
1335
+ "learning_rate": 1.0972270661401812e-05,
1336
+ "loss": 0.0031,
1337
+ "step": 184
1338
+ },
1339
+ {
1340
+ "epoch": 1.4566929133858268,
1341
+ "grad_norm": 0.1744084656238556,
1342
+ "learning_rate": 1.0887958953229349e-05,
1343
+ "loss": 0.0024,
1344
+ "step": 185
1345
+ },
1346
+ {
1347
+ "epoch": 1.4645669291338583,
1348
+ "grad_norm": 0.20646224915981293,
1349
+ "learning_rate": 1.0803583573976137e-05,
1350
+ "loss": 0.008,
1351
+ "step": 186
1352
+ },
1353
+ {
1354
+ "epoch": 1.4724409448818898,
1355
+ "grad_norm": 0.14391584694385529,
1356
+ "learning_rate": 1.0719150573777226e-05,
1357
+ "loss": 0.004,
1358
+ "step": 187
1359
+ },
1360
+ {
1361
+ "epoch": 1.4803149606299213,
1362
+ "grad_norm": 0.36887863278388977,
1363
+ "learning_rate": 1.0634666006899375e-05,
1364
+ "loss": 0.0074,
1365
+ "step": 188
1366
+ },
1367
+ {
1368
+ "epoch": 1.4881889763779528,
1369
+ "grad_norm": 0.21352627873420715,
1370
+ "learning_rate": 1.055013593130693e-05,
1371
+ "loss": 0.0082,
1372
+ "step": 189
1373
+ },
1374
+ {
1375
+ "epoch": 1.4960629921259843,
1376
+ "grad_norm": 0.22443020343780518,
1377
+ "learning_rate": 1.046556640822744e-05,
1378
+ "loss": 0.0087,
1379
+ "step": 190
1380
+ },
1381
+ {
1382
+ "epoch": 1.5039370078740157,
1383
+ "grad_norm": 0.4243764281272888,
1384
+ "learning_rate": 1.0380963501717034e-05,
1385
+ "loss": 0.0068,
1386
+ "step": 191
1387
+ },
1388
+ {
1389
+ "epoch": 1.5118110236220472,
1390
+ "grad_norm": 0.17558562755584717,
1391
+ "learning_rate": 1.0296333278225599e-05,
1392
+ "loss": 0.0054,
1393
+ "step": 192
1394
+ },
1395
+ {
1396
+ "epoch": 1.5196850393700787,
1397
+ "grad_norm": 0.14842620491981506,
1398
+ "learning_rate": 1.0211681806161787e-05,
1399
+ "loss": 0.0031,
1400
+ "step": 193
1401
+ },
1402
+ {
1403
+ "epoch": 1.5275590551181102,
1404
+ "grad_norm": 0.09316081553697586,
1405
+ "learning_rate": 1.0127015155457875e-05,
1406
+ "loss": 0.0013,
1407
+ "step": 194
1408
+ },
1409
+ {
1410
+ "epoch": 1.5354330708661417,
1411
+ "grad_norm": 0.19795025885105133,
1412
+ "learning_rate": 1.0042339397134528e-05,
1413
+ "loss": 0.0051,
1414
+ "step": 195
1415
+ },
1416
+ {
1417
+ "epoch": 1.5433070866141732,
1418
+ "grad_norm": 0.21606990694999695,
1419
+ "learning_rate": 9.957660602865477e-06,
1420
+ "loss": 0.0041,
1421
+ "step": 196
1422
+ },
1423
+ {
1424
+ "epoch": 1.5511811023622046,
1425
+ "grad_norm": 0.18036173284053802,
1426
+ "learning_rate": 9.872984844542128e-06,
1427
+ "loss": 0.0037,
1428
+ "step": 197
1429
+ },
1430
+ {
1431
+ "epoch": 1.5590551181102361,
1432
+ "grad_norm": 0.18953870236873627,
1433
+ "learning_rate": 9.788318193838218e-06,
1434
+ "loss": 0.0041,
1435
+ "step": 198
1436
+ },
1437
+ {
1438
+ "epoch": 1.5669291338582676,
1439
+ "grad_norm": 0.12346503138542175,
1440
+ "learning_rate": 9.703666721774403e-06,
1441
+ "loss": 0.0035,
1442
+ "step": 199
1443
+ },
1444
+ {
1445
+ "epoch": 1.574803149606299,
1446
+ "grad_norm": 0.4576225280761719,
1447
+ "learning_rate": 9.619036498282968e-06,
1448
+ "loss": 0.0041,
1449
+ "step": 200
1450
+ },
1451
+ {
1452
+ "epoch": 1.5826771653543306,
1453
+ "grad_norm": 0.10333681106567383,
1454
+ "learning_rate": 9.534433591772562e-06,
1455
+ "loss": 0.0011,
1456
+ "step": 201
1457
+ },
1458
+ {
1459
+ "epoch": 1.590551181102362,
1460
+ "grad_norm": 0.19167865812778473,
1461
+ "learning_rate": 9.449864068693072e-06,
1462
+ "loss": 0.0062,
1463
+ "step": 202
1464
+ },
1465
+ {
1466
+ "epoch": 1.5984251968503937,
1467
+ "grad_norm": 0.2258184254169464,
1468
+ "learning_rate": 9.365333993100628e-06,
1469
+ "loss": 0.003,
1470
+ "step": 203
1471
+ },
1472
+ {
1473
+ "epoch": 1.6062992125984252,
1474
+ "grad_norm": 0.07945302873849869,
1475
+ "learning_rate": 9.280849426222778e-06,
1476
+ "loss": 0.0008,
1477
+ "step": 204
1478
+ },
1479
+ {
1480
+ "epoch": 1.6141732283464567,
1481
+ "grad_norm": 0.17767398059368134,
1482
+ "learning_rate": 9.196416426023868e-06,
1483
+ "loss": 0.0053,
1484
+ "step": 205
1485
+ },
1486
+ {
1487
+ "epoch": 1.6220472440944882,
1488
+ "grad_norm": 0.12704500555992126,
1489
+ "learning_rate": 9.112041046770653e-06,
1490
+ "loss": 0.0023,
1491
+ "step": 206
1492
+ },
1493
+ {
1494
+ "epoch": 1.6299212598425197,
1495
+ "grad_norm": 0.4054742753505707,
1496
+ "learning_rate": 9.027729338598188e-06,
1497
+ "loss": 0.0045,
1498
+ "step": 207
1499
+ },
1500
+ {
1501
+ "epoch": 1.6377952755905512,
1502
+ "grad_norm": 0.4463757574558258,
1503
+ "learning_rate": 8.943487347075988e-06,
1504
+ "loss": 0.007,
1505
+ "step": 208
1506
+ },
1507
+ {
1508
+ "epoch": 1.6456692913385826,
1509
+ "grad_norm": 0.6517045497894287,
1510
+ "learning_rate": 8.859321112774535e-06,
1511
+ "loss": 0.0052,
1512
+ "step": 209
1513
+ },
1514
+ {
1515
+ "epoch": 1.6535433070866141,
1516
+ "grad_norm": 0.1542089730501175,
1517
+ "learning_rate": 8.775236670832146e-06,
1518
+ "loss": 0.0047,
1519
+ "step": 210
1520
+ },
1521
+ {
1522
+ "epoch": 1.6614173228346458,
1523
+ "grad_norm": 0.14716440439224243,
1524
+ "learning_rate": 8.691240050522215e-06,
1525
+ "loss": 0.0049,
1526
+ "step": 211
1527
+ },
1528
+ {
1529
+ "epoch": 1.6692913385826773,
1530
+ "grad_norm": 0.2997347116470337,
1531
+ "learning_rate": 8.607337274820888e-06,
1532
+ "loss": 0.0076,
1533
+ "step": 212
1534
+ },
1535
+ {
1536
+ "epoch": 1.6771653543307088,
1537
+ "grad_norm": 0.22548256814479828,
1538
+ "learning_rate": 8.52353435997519e-06,
1539
+ "loss": 0.0063,
1540
+ "step": 213
1541
+ },
1542
+ {
1543
+ "epoch": 1.6850393700787403,
1544
+ "grad_norm": 0.7220733165740967,
1545
+ "learning_rate": 8.439837315071612e-06,
1546
+ "loss": 0.0089,
1547
+ "step": 214
1548
+ },
1549
+ {
1550
+ "epoch": 1.6929133858267718,
1551
+ "grad_norm": 0.5101618766784668,
1552
+ "learning_rate": 8.35625214160526e-06,
1553
+ "loss": 0.0042,
1554
+ "step": 215
1555
+ },
1556
+ {
1557
+ "epoch": 1.6929133858267718,
1558
+ "eval_loss": 0.3484288156032562,
1559
+ "eval_runtime": 6.4482,
1560
+ "eval_samples_per_second": 25.278,
1561
+ "eval_steps_per_second": 3.257,
1562
+ "step": 215
1563
+ },
1564
+ {
1565
+ "epoch": 1.7007874015748032,
1566
+ "grad_norm": 0.1698393076658249,
1567
+ "learning_rate": 8.272784833049485e-06,
1568
+ "loss": 0.0028,
1569
+ "step": 216
1570
+ },
1571
+ {
1572
+ "epoch": 1.7086614173228347,
1573
+ "grad_norm": 0.5772718191146851,
1574
+ "learning_rate": 8.18944137442615e-06,
1575
+ "loss": 0.0082,
1576
+ "step": 217
1577
+ },
1578
+ {
1579
+ "epoch": 1.7165354330708662,
1580
+ "grad_norm": 0.09606469422578812,
1581
+ "learning_rate": 8.106227741876447e-06,
1582
+ "loss": 0.0011,
1583
+ "step": 218
1584
+ },
1585
+ {
1586
+ "epoch": 1.7244094488188977,
1587
+ "grad_norm": 0.14510361850261688,
1588
+ "learning_rate": 8.023149902232404e-06,
1589
+ "loss": 0.0015,
1590
+ "step": 219
1591
+ },
1592
+ {
1593
+ "epoch": 1.7322834645669292,
1594
+ "grad_norm": 0.055804118514060974,
1595
+ "learning_rate": 7.940213812589018e-06,
1596
+ "loss": 0.0008,
1597
+ "step": 220
1598
+ },
1599
+ {
1600
+ "epoch": 1.7401574803149606,
1601
+ "grad_norm": 0.13318321108818054,
1602
+ "learning_rate": 7.857425419877097e-06,
1603
+ "loss": 0.005,
1604
+ "step": 221
1605
+ },
1606
+ {
1607
+ "epoch": 1.7480314960629921,
1608
+ "grad_norm": 0.23600782454013824,
1609
+ "learning_rate": 7.774790660436857e-06,
1610
+ "loss": 0.0063,
1611
+ "step": 222
1612
+ },
1613
+ {
1614
+ "epoch": 1.7559055118110236,
1615
+ "grad_norm": 0.8483791351318359,
1616
+ "learning_rate": 7.69231545959223e-06,
1617
+ "loss": 0.0027,
1618
+ "step": 223
1619
+ },
1620
+ {
1621
+ "epoch": 1.763779527559055,
1622
+ "grad_norm": 0.16536197066307068,
1623
+ "learning_rate": 7.610005731226009e-06,
1624
+ "loss": 0.0039,
1625
+ "step": 224
1626
+ },
1627
+ {
1628
+ "epoch": 1.7716535433070866,
1629
+ "grad_norm": 0.14446765184402466,
1630
+ "learning_rate": 7.52786737735578e-06,
1631
+ "loss": 0.0036,
1632
+ "step": 225
1633
+ },
1634
+ {
1635
+ "epoch": 1.779527559055118,
1636
+ "grad_norm": 0.8880365490913391,
1637
+ "learning_rate": 7.445906287710733e-06,
1638
+ "loss": 0.0061,
1639
+ "step": 226
1640
+ },
1641
+ {
1642
+ "epoch": 1.7874015748031495,
1643
+ "grad_norm": 0.151743084192276,
1644
+ "learning_rate": 7.364128339309326e-06,
1645
+ "loss": 0.0028,
1646
+ "step": 227
1647
+ },
1648
+ {
1649
+ "epoch": 1.795275590551181,
1650
+ "grad_norm": 0.1224551647901535,
1651
+ "learning_rate": 7.282539396037868e-06,
1652
+ "loss": 0.002,
1653
+ "step": 228
1654
+ },
1655
+ {
1656
+ "epoch": 1.8031496062992125,
1657
+ "grad_norm": 0.4868486225605011,
1658
+ "learning_rate": 7.201145308230075e-06,
1659
+ "loss": 0.0031,
1660
+ "step": 229
1661
+ },
1662
+ {
1663
+ "epoch": 1.811023622047244,
1664
+ "grad_norm": 0.2875569462776184,
1665
+ "learning_rate": 7.119951912247545e-06,
1666
+ "loss": 0.0082,
1667
+ "step": 230
1668
+ },
1669
+ {
1670
+ "epoch": 1.8188976377952755,
1671
+ "grad_norm": 0.43524420261383057,
1672
+ "learning_rate": 7.038965030061273e-06,
1673
+ "loss": 0.0075,
1674
+ "step": 231
1675
+ },
1676
+ {
1677
+ "epoch": 1.826771653543307,
1678
+ "grad_norm": 0.39634883403778076,
1679
+ "learning_rate": 6.9581904688341854e-06,
1680
+ "loss": 0.0032,
1681
+ "step": 232
1682
+ },
1683
+ {
1684
+ "epoch": 1.8346456692913384,
1685
+ "grad_norm": 0.9809433817863464,
1686
+ "learning_rate": 6.8776340205047446e-06,
1687
+ "loss": 0.0085,
1688
+ "step": 233
1689
+ },
1690
+ {
1691
+ "epoch": 1.84251968503937,
1692
+ "grad_norm": 0.20062875747680664,
1693
+ "learning_rate": 6.797301461371626e-06,
1694
+ "loss": 0.0043,
1695
+ "step": 234
1696
+ },
1697
+ {
1698
+ "epoch": 1.8503937007874016,
1699
+ "grad_norm": 0.148948073387146,
1700
+ "learning_rate": 6.7171985516795315e-06,
1701
+ "loss": 0.0036,
1702
+ "step": 235
1703
+ },
1704
+ {
1705
+ "epoch": 1.858267716535433,
1706
+ "grad_norm": 0.15658679604530334,
1707
+ "learning_rate": 6.637331035206166e-06,
1708
+ "loss": 0.0046,
1709
+ "step": 236
1710
+ },
1711
+ {
1712
+ "epoch": 1.8661417322834646,
1713
+ "grad_norm": 0.22365815937519073,
1714
+ "learning_rate": 6.557704638850352e-06,
1715
+ "loss": 0.0081,
1716
+ "step": 237
1717
+ },
1718
+ {
1719
+ "epoch": 1.874015748031496,
1720
+ "grad_norm": 0.10596666485071182,
1721
+ "learning_rate": 6.4783250722214066e-06,
1722
+ "loss": 0.0032,
1723
+ "step": 238
1724
+ },
1725
+ {
1726
+ "epoch": 1.8818897637795275,
1727
+ "grad_norm": 0.2130754142999649,
1728
+ "learning_rate": 6.399198027229732e-06,
1729
+ "loss": 0.0056,
1730
+ "step": 239
1731
+ },
1732
+ {
1733
+ "epoch": 1.889763779527559,
1734
+ "grad_norm": 0.05641167238354683,
1735
+ "learning_rate": 6.320329177678656e-06,
1736
+ "loss": 0.0008,
1737
+ "step": 240
1738
+ },
1739
+ {
1740
+ "epoch": 1.8976377952755905,
1741
+ "grad_norm": 0.10349344462156296,
1742
+ "learning_rate": 6.241724178857621e-06,
1743
+ "loss": 0.0026,
1744
+ "step": 241
1745
+ },
1746
+ {
1747
+ "epoch": 1.905511811023622,
1748
+ "grad_norm": 0.08451675623655319,
1749
+ "learning_rate": 6.163388667136646e-06,
1750
+ "loss": 0.0016,
1751
+ "step": 242
1752
+ },
1753
+ {
1754
+ "epoch": 1.9133858267716537,
1755
+ "grad_norm": 0.13671623170375824,
1756
+ "learning_rate": 6.085328259562195e-06,
1757
+ "loss": 0.0034,
1758
+ "step": 243
1759
+ },
1760
+ {
1761
+ "epoch": 1.9212598425196852,
1762
+ "grad_norm": 0.5500523447990417,
1763
+ "learning_rate": 6.007548553454379e-06,
1764
+ "loss": 0.0028,
1765
+ "step": 244
1766
+ },
1767
+ {
1768
+ "epoch": 1.9291338582677167,
1769
+ "grad_norm": 0.06702329218387604,
1770
+ "learning_rate": 5.93005512600563e-06,
1771
+ "loss": 0.0009,
1772
+ "step": 245
1773
+ },
1774
+ {
1775
+ "epoch": 1.9370078740157481,
1776
+ "grad_norm": 0.15156973898410797,
1777
+ "learning_rate": 5.852853533880768e-06,
1778
+ "loss": 0.0064,
1779
+ "step": 246
1780
+ },
1781
+ {
1782
+ "epoch": 1.9448818897637796,
1783
+ "grad_norm": 0.2970314621925354,
1784
+ "learning_rate": 5.7759493128185584e-06,
1785
+ "loss": 0.0077,
1786
+ "step": 247
1787
+ },
1788
+ {
1789
+ "epoch": 1.952755905511811,
1790
+ "grad_norm": 0.06406261771917343,
1791
+ "learning_rate": 5.699347977234799e-06,
1792
+ "loss": 0.0006,
1793
+ "step": 248
1794
+ },
1795
+ {
1796
+ "epoch": 1.9606299212598426,
1797
+ "grad_norm": 0.2910393178462982,
1798
+ "learning_rate": 5.623055019826862e-06,
1799
+ "loss": 0.0036,
1800
+ "step": 249
1801
+ },
1802
+ {
1803
+ "epoch": 1.968503937007874,
1804
+ "grad_norm": 0.6454993486404419,
1805
+ "learning_rate": 5.547075911179902e-06,
1806
+ "loss": 0.0084,
1807
+ "step": 250
1808
+ },
1809
+ {
1810
+ "epoch": 1.9763779527559056,
1811
+ "grad_norm": 0.09460143744945526,
1812
+ "learning_rate": 5.471416099374525e-06,
1813
+ "loss": 0.0021,
1814
+ "step": 251
1815
+ },
1816
+ {
1817
+ "epoch": 1.984251968503937,
1818
+ "grad_norm": 0.2024363875389099,
1819
+ "learning_rate": 5.3960810095961705e-06,
1820
+ "loss": 0.0052,
1821
+ "step": 252
1822
+ },
1823
+ {
1824
+ "epoch": 1.9921259842519685,
1825
+ "grad_norm": 0.09423142671585083,
1826
+ "learning_rate": 5.321076043746108e-06,
1827
+ "loss": 0.0018,
1828
+ "step": 253
1829
+ },
1830
+ {
1831
+ "epoch": 2.0,
1832
+ "grad_norm": 0.1085880920290947,
1833
+ "learning_rate": 5.246406580054051e-06,
1834
+ "loss": 0.0039,
1835
+ "step": 254
1836
+ }
1837
+ ],
1838
+ "logging_steps": 1,
1839
+ "max_steps": 381,
1840
+ "num_input_tokens_seen": 0,
1841
+ "num_train_epochs": 3,
1842
+ "save_steps": 127,
1843
+ "stateful_callbacks": {
1844
+ "TrainerControl": {
1845
+ "args": {
1846
+ "should_epoch_stop": false,
1847
+ "should_evaluate": false,
1848
+ "should_log": false,
1849
+ "should_save": true,
1850
+ "should_training_stop": false
1851
+ },
1852
+ "attributes": {}
1853
+ }
1854
+ },
1855
+ "total_flos": 2.6174542139778662e+17,
1856
+ "train_batch_size": 128,
1857
+ "trial_name": null,
1858
+ "trial_params": null
1859
+ }
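
The block above closes the log_history array of checkpoint-254/trainer_state.json; it follows the standard Hugging Face Trainer layout, with one entry per logged step ("epoch", "grad_norm", "learning_rate", "loss") interleaved with periodic evaluation entries ("eval_loss", "eval_runtime", ...), next to run-level fields such as "max_steps" and "save_steps". A minimal sketch for pulling the curves back out of a local copy (standard library only; the path is hypothetical):

import json

# Hypothetical local path; any checkpoint-*/trainer_state.json in this repo works.
with open("checkpoint-254/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss" instead.
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(len(train_curve), "train points;", "last eval:", eval_curve[-1])
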
checkpoint-254/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:033fc2cc0303528d4e9ad523b3fd63b75e963b86dba301044379df1d98e6c394
3
+ size 10744
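
The three lines above are a Git LFS pointer rather than the binary itself: the LFS spec version, the sha256 of the stored object, and its size in bytes. A quick integrity check of a downloaded copy against this pointer (a sketch; the local path is hypothetical and assumes the LFS object has been pulled):

import hashlib
import os

path = "checkpoint-254/training_args.bin"  # hypothetical local path
expected_oid = "033fc2cc0303528d4e9ad523b3fd63b75e963b86dba301044379df1d98e6c394"

assert os.path.getsize(path) == 10744, "size does not match the LFS pointer"
with open(path, "rb") as f:
    assert hashlib.sha256(f.read()).hexdigest() == expected_oid, "oid mismatch"
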
checkpoint-254/vocab.json ADDED
The diff for this file is too large to render. See raw diff
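
vocab.json is the tokenizer's token-to-id table; for a Qwen2.5-style BPE tokenizer it holds on the order of 150k entries, which is why the diff is not rendered inline. A short sketch for inspecting a local copy, assuming the usual vocab.json format of one flat JSON object mapping token strings to integer ids:

import json

with open("checkpoint-254/vocab.json") as f:  # hypothetical local path
    vocab = json.load(f)  # token string -> integer id

print(len(vocab))  # vocabulary size
id_to_token = {i: t for t, i in vocab.items()}  # reverse lookup table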
 
checkpoint-254/zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # While this script doesn't use DeepSpeed to recover data, the checkpoints are pickled with
32
+ # DeepSpeed data structures, so DeepSpeed has to be available in the current Python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
+ @dataclass
40
+ class zero_model_state:
41
+ buffers: dict
42
+ param_shapes: dict
43
+ shared_params: list
44
+ ds_version: int
45
+ frozen_param_shapes: dict
46
+ frozen_param_fragments: dict
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
+ def atoi(text):
56
+ return int(text) if text.isdigit() else text
57
+
58
+
59
+ def natural_keys(text):
60
+ '''
61
+ alist.sort(key=natural_keys) sorts in human order
62
+ http://nedbatchelder.com/blog/200712/human_sorting.html
63
+ (See Toothy's implementation in the comments)
64
+ '''
65
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
66
+
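+ # Illustration: natural_keys("rank_10") -> ['rank_', 10, ''], so
+ # sorted(["rank_10", "rank_2"], key=natural_keys) yields ["rank_2", "rank_10"]
+ # instead of the lexicographic ["rank_10", "rank_2"].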
67
+
68
+ def get_model_state_file(checkpoint_dir, zero_stage):
69
+ if not os.path.isdir(checkpoint_dir):
70
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
71
+
72
+ # there should be only one file
73
+ if zero_stage <= 2:
74
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
75
+ elif zero_stage == 3:
76
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
77
+
78
+ if not os.path.exists(file):
79
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
80
+
81
+ return file
82
+
83
+
84
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
85
+ # XXX: need to test that this simple glob rule works for multi-node setup too
86
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
87
+
88
+ if len(ckpt_files) == 0:
89
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
90
+
91
+ return ckpt_files
92
+
93
+
94
+ def get_optim_files(checkpoint_dir):
95
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
96
+
97
+
98
+ def get_model_state_files(checkpoint_dir):
99
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
100
+
101
+
102
+ def parse_model_states(files):
103
+ zero_model_states = []
104
+ for file in files:
105
+ state_dict = torch.load(file, map_location=device, weights_only=False)
106
+
107
+ if BUFFER_NAMES not in state_dict:
108
+ raise ValueError(f"{file} is not a model state checkpoint")
109
+ buffer_names = state_dict[BUFFER_NAMES]
110
+ if debug:
111
+ print("Found buffers:", buffer_names)
112
+
113
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
114
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
115
+ param_shapes = state_dict[PARAM_SHAPES]
116
+
117
+ # collect parameters that are included in param_shapes
118
+ param_names = []
119
+ for s in param_shapes:
120
+ for name in s.keys():
121
+ param_names.append(name)
122
+
123
+ # update with frozen parameters
124
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
125
+ if frozen_param_shapes is not None:
126
+ if debug:
127
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
128
+ param_names += list(frozen_param_shapes.keys())
129
+
130
+ # handle shared params
131
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
132
+
133
+ ds_version = state_dict.get(DS_VERSION, None)
134
+
135
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
136
+
137
+ z_model_state = zero_model_state(buffers=buffers,
138
+ param_shapes=param_shapes,
139
+ shared_params=shared_params,
140
+ ds_version=ds_version,
141
+ frozen_param_shapes=frozen_param_shapes,
142
+ frozen_param_fragments=frozen_param_fragments)
143
+ zero_model_states.append(z_model_state)
144
+
145
+ return zero_model_states
146
+
147
+
148
+ def parse_optim_states(files, ds_checkpoint_dir):
149
+ total_files = len(files)
150
+ state_dicts = []
151
+ for f in tqdm(files, desc='Loading checkpoint shards'):
152
+ state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
153
+ # immediately discard the two potentially huge optimizer states, as we only care about the fp32 master weights
154
+ # and also handle the case where it was already removed by another helper script
155
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
156
+ state_dicts.append(state_dict)
157
+
158
+ if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
159
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
160
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
161
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
162
+
163
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
164
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
165
+ # use the max of the partition_count to get the dp world_size.
166
+
167
+ if type(world_size) is list:
168
+ world_size = max(world_size)
169
+
170
+ if world_size != total_files:
171
+ raise ValueError(
172
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
173
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
174
+ )
175
+
176
+ # the groups are named differently in each stage
177
+ if zero_stage <= 2:
178
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
179
+ elif zero_stage == 3:
180
+ fp32_groups_key = FP32_FLAT_GROUPS
181
+ else:
182
+ raise ValueError(f"unknown zero stage {zero_stage}")
183
+
184
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
185
+ return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
189
+ """
190
+ Returns fp32 state_dict reconstructed from ds checkpoint
191
+
192
+ Args:
193
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
194
+
195
+ """
196
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
197
+
198
+ optim_files = get_optim_files(ds_checkpoint_dir)
199
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
200
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
201
+
202
+ model_files = get_model_state_files(ds_checkpoint_dir)
203
+
204
+ zero_model_states = parse_model_states(model_files)
205
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
206
+
207
+ if zero_stage <= 2:
208
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
209
+ exclude_frozen_parameters)
210
+ elif zero_stage == 3:
211
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
212
+ exclude_frozen_parameters)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
253
+ param_shapes = zero_model_states[0].param_shapes
254
+
255
+ # Reconstruction protocol:
256
+ #
257
+ # XXX: document this
258
+
259
+ if debug:
260
+ for i in range(world_size):
261
+ for j in range(len(fp32_flat_groups[0])):
262
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
263
+
264
+ # XXX: memory usage doubles here (zero2)
265
+ num_param_groups = len(fp32_flat_groups[0])
266
+ merged_single_partition_of_fp32_groups = []
267
+ for i in range(num_param_groups):
268
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
269
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
270
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
271
+ avail_numel = sum(
272
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
273
+
274
+ if debug:
275
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
276
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
277
+ # not asserting if there is a mismatch due to possible padding
278
+ print(f"Have {avail_numel} numels to process.")
279
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
280
+
281
+ # params
282
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
283
+ # an out-of-core computing solution
284
+ total_numel = 0
285
+ total_params = 0
286
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
287
+ offset = 0
288
+ avail_numel = full_single_fp32_vector.numel()
289
+ for name, shape in shapes.items():
290
+
291
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
292
+ total_numel += unpartitioned_numel
293
+ total_params += 1
294
+
295
+ if debug:
296
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
297
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
298
+ offset += unpartitioned_numel
299
+
300
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
301
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
302
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
303
+ # live optimizer object, so we are checking that the numbers are within the right range
304
+ align_to = 2 * world_size
305
+
306
+ def zero2_align(x):
307
+ return align_to * math.ceil(x / align_to)
308
+
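+ # Illustration: with world_size=2, align_to == 4 and zero2_align(10) == 12; both
+ # offset and avail_numel are rounded up to a multiple of 2*world_size below.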
309
+ if debug:
310
+ print(f"original offset={offset}, avail_numel={avail_numel}")
311
+
312
+ offset = zero2_align(offset)
313
+ avail_numel = zero2_align(avail_numel)
314
+
315
+ if debug:
316
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
317
+
318
+ # Sanity check
319
+ if offset != avail_numel:
320
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
321
+
322
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
326
+ exclude_frozen_parameters):
327
+ state_dict = OrderedDict()
328
+
329
+ # buffers
330
+ buffers = zero_model_states[0].buffers
331
+ state_dict.update(buffers)
332
+ if debug:
333
+ print(f"added {len(buffers)} buffers")
334
+
335
+ if not exclude_frozen_parameters:
336
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
337
+
338
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
339
+
340
+ # recover shared parameters
341
+ for pair in zero_model_states[0].shared_params:
342
+ if pair[1] in state_dict:
343
+ state_dict[pair[0]] = state_dict[pair[1]]
344
+
345
+ return state_dict
346
+
347
+
348
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
349
+ remainder = unpartitioned_numel % world_size
350
+ padding_numel = (world_size - remainder) if remainder else 0
351
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
352
+ return partitioned_numel, padding_numel
353
+
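+ # Illustration: unpartitioned_numel=10, world_size=4 -> (partitioned_numel=3,
+ # padding_numel=2): each of the 4 ranks holds ceil(10/4)=3 elements, and 4*3 == 10+2.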
354
+
355
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
356
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
357
+ return
358
+
359
+ if debug:
360
+ for i in range(world_size):
361
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
362
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
363
+
364
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
365
+ wanted_params = len(frozen_param_shapes)
366
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
367
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
368
+ print(f'Frozen params: Have {avail_numel} numels to process.')
369
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
370
+
371
+ total_params = 0
372
+ total_numel = 0
373
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
374
+ total_params += 1
375
+ unpartitioned_numel = shape.numel()
376
+ total_numel += unpartitioned_numel
377
+
378
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
379
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
380
+
381
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
382
+
383
+ if debug:
384
+ print(
385
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
386
+ )
387
+
388
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
+ class GatheredTensor:
392
+ """
393
+ A pseudo tensor that collects partitioned weights.
394
+ It is more memory efficient when there are multiple groups.
395
+ """
396
+
397
+ def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
398
+ self.flat_groups = flat_groups
399
+ self.flat_groups_offset = flat_groups_offset
400
+ self.offset = offset
401
+ self.partitioned_numel = partitioned_numel
402
+ self.shape = shape
403
+ self.dtype = self.flat_groups[0][0].dtype
404
+
405
+ def contiguous(self):
406
+ """
407
+ Merge partitioned weights from flat_groups into a single tensor.
408
+ """
409
+ end_idx = self.offset + self.partitioned_numel
410
+ world_size = len(self.flat_groups)
411
+ pad_flat_param_chunks = []
412
+
413
+ for rank_i in range(world_size):
414
+ # for each rank, we need to collect weights from related group/groups
415
+ flat_groups_at_rank_i = self.flat_groups[rank_i]
416
+ start_group_id = None
417
+ end_group_id = None
418
+ for group_id in range(len(self.flat_groups_offset)):
419
+ if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
420
+ start_group_id = group_id
421
+ if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
422
+ end_group_id = group_id
423
+ break
424
+ # collect weights from related group/groups
425
+ for group_id in range(start_group_id, end_group_id + 1):
426
+ flat_tensor = flat_groups_at_rank_i[group_id]
427
+ start_offset = self.offset - self.flat_groups_offset[group_id]
428
+ end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
429
+ pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
430
+
431
+ # collect weights from all ranks
432
+ pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
433
+ param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
434
+ return param
435
+
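+ # Illustration: with per-rank groups of numel [4, 6], flat_groups_offset == [0, 4, 10];
+ # a parameter at offset=4 with partitioned_numel=3 maps to slice [0:3] of group 1 on
+ # every rank, and contiguous() concatenates those per-rank chunks in rank order.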
436
+
437
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
438
+ param_shapes = zero_model_states[0].param_shapes
439
+ avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
440
+
441
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
442
+ # param, re-consolidating each param, while dealing with padding if any
443
+
444
+ # merge list of dicts, preserving order
445
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
446
+
447
+ if debug:
448
+ for i in range(world_size):
449
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
450
+
451
+ wanted_params = len(param_shapes)
452
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
453
+ # not asserting if there is a mismatch due to possible padding
454
+ avail_numel = fp32_flat_groups[0].numel() * world_size
455
+ print(f"Trainable params: Have {avail_numel} numels to process.")
456
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
457
+
458
+ # params
459
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
460
+ # an out-of-core computing solution
461
+ offset = 0
462
+ total_numel = 0
463
+ total_params = 0
464
+ flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
465
+ for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
466
+ unpartitioned_numel = shape.numel()
467
+ total_numel += unpartitioned_numel
468
+ total_params += 1
469
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
470
+
471
+ if debug:
472
+ print(
473
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
474
+ )
475
+
476
+ # memory efficient tensor
477
+ tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
478
+ state_dict[name] = tensor
479
+ offset += partitioned_numel
480
+
481
+ offset *= world_size
482
+
483
+ # Sanity check
484
+ if offset != avail_numel:
485
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
486
+
487
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
491
+ exclude_frozen_parameters):
492
+ state_dict = OrderedDict()
493
+
494
+ # buffers
495
+ buffers = zero_model_states[0].buffers
496
+ state_dict.update(buffers)
497
+ if debug:
498
+ print(f"added {len(buffers)} buffers")
499
+
500
+ if not exclude_frozen_parameters:
501
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
502
+
503
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
504
+
505
+ # recover shared parameters
506
+ for pair in zero_model_states[0].shared_params:
507
+ if pair[1] in state_dict:
508
+ state_dict[pair[0]] = state_dict[pair[1]]
509
+
510
+ return state_dict
511
+
512
+
513
+ def to_torch_tensor(state_dict, return_empty_tensor=False):
514
+ """
515
+ Convert a state_dict of GatheredTensor entries to torch tensors
516
+ """
517
+ torch_state_dict = {}
518
+ converted_tensors = {}
519
+ for name, tensor in state_dict.items():
520
+ tensor_id = id(tensor)
521
+ if tensor_id in converted_tensors: # shared tensors
522
+ shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
523
+ torch_state_dict[name] = shared_tensor
524
+ else:
525
+ converted_tensors[tensor_id] = name
526
+ if return_empty_tensor:
527
+ torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
528
+ else:
529
+ torch_state_dict[name] = tensor.contiguous()
530
+ return torch_state_dict
531
+
532
+
533
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
534
+ tag=None,
535
+ exclude_frozen_parameters=False,
536
+ lazy_mode=False):
537
+ """
538
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
539
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
540
+ via a model hub.
541
+
542
+ Args:
543
+ - ``checkpoint_dir``: path to the desired checkpoint folder
544
+ - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, the tag is read from the ``latest`` file, e.g. ``global_step14``
545
+ - ``exclude_frozen_parameters``: exclude frozen parameters
546
+ - ``lazy_mode``: get the state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
547
+ Convert a pseudo tensor to a torch tensor with ``.contiguous()``
548
+
549
+ Returns:
550
+ - pytorch ``state_dict``
551
+
552
+ A typical usage might be ::
553
+
554
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
555
+ # do the training and checkpoint saving
556
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
557
+ model = model.cpu() # move to cpu
558
+ model.load_state_dict(state_dict)
559
+ # submit to model hub or save the model to share with others
560
+
561
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
562
+ application; i.e. you will need to re-initialize the deepspeed engine, since
563
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
564
+
565
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
566
+
567
+ Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
568
+ You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
569
+ the checkpoint. Or you can load state_dict in lazy mode ::
570
+
571
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
572
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
573
+ for name, lazy_tensor in state_dict.items():
574
+ tensor = lazy_tensor.contiguous() # to cpu
575
+ print(name, tensor)
576
+ # del tensor to release memory if it is no longer in use
577
+ """
578
+ if tag is None:
579
+ latest_path = os.path.join(checkpoint_dir, 'latest')
580
+ if os.path.isfile(latest_path):
581
+ with open(latest_path, 'r') as fd:
582
+ tag = fd.read().strip()
583
+ else:
584
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
585
+
586
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
587
+
588
+ if not os.path.isdir(ds_checkpoint_dir):
589
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
590
+
591
+ state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
592
+ if lazy_mode:
593
+ return state_dict
594
+ else:
595
+ return to_torch_tensor(state_dict)
596
+
597
+
598
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
599
+ output_dir,
600
+ max_shard_size="5GB",
601
+ safe_serialization=False,
602
+ tag=None,
603
+ exclude_frozen_parameters=False):
604
+ """
605
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
606
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
607
+
608
+ Args:
609
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
610
+ - ``output_dir``: directory to the pytorch fp32 state_dict output files
611
+ - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
612
+ - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
613
+ - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
614
+ - ``exclude_frozen_parameters``: exclude frozen parameters
615
+ """
616
+
617
+ # Dependency pre-check
618
+ if safe_serialization:
619
+ try:
620
+ from safetensors.torch import save_file
621
+ except ImportError:
622
+ print('If you want to use `safe_serialization`, please `pip install safetensors`')
623
+ raise
624
+ if max_shard_size is not None:
625
+ try:
626
+ from huggingface_hub import split_torch_state_dict_into_shards
627
+ except ImportError:
628
+ print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
629
+ raise
630
+
631
+ # Convert zero checkpoint to state_dict
632
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
633
+ tag,
634
+ exclude_frozen_parameters,
635
+ lazy_mode=True)
636
+
637
+ # Shard the model if it is too big.
638
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
639
+ if max_shard_size is not None:
640
+ filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
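+ # "{suffix}" is filled in by split_torch_state_dict_into_shards, e.g. "-00001-of-00002"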
641
+ # a memory-efficient approach for sharding
642
+ empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
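+ # the empty tensors carry only shape and dtype, which is all the sharding plan needs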
643
+ state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
644
+ filename_pattern=filename_pattern,
645
+ max_shard_size=max_shard_size)
646
+ else:
647
+ from collections import namedtuple
648
+ StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
649
+ state_dict_split = StateDictSplit(is_sharded=False,
650
+ filename_to_tensors={weights_name: list(state_dict.keys())})
651
+
652
+ # Save the model by shard
653
+ os.makedirs(output_dir, exist_ok=True)
654
+ filename_to_tensors = state_dict_split.filename_to_tensors.items()
655
+ for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
656
+ shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
657
+ shard_state_dict = to_torch_tensor(shard_state_dict)
658
+ output_path = os.path.join(output_dir, shard_file)
659
+ if safe_serialization:
660
+ save_file(shard_state_dict, output_path, metadata={"format": "pt"})
661
+ else:
662
+ torch.save(shard_state_dict, output_path)
663
+ # release the memory of current shard
664
+ for tensor_name in list(shard_state_dict.keys()):
665
+ del state_dict[tensor_name]
666
+ del shard_state_dict[tensor_name]
667
+ del shard_state_dict
668
+ gc.collect()
669
+
670
+ # Save index if sharded
671
+ if state_dict_split.is_sharded:
672
+ index = {
673
+ "metadata": state_dict_split.metadata,
674
+ "weight_map": state_dict_split.tensor_to_filename,
675
+ }
676
+ save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
677
+ save_index_file = os.path.join(output_dir, save_index_file)
678
+ with open(save_index_file, "w", encoding="utf-8") as f:
679
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
680
+ f.write(content)
681
+
682
+
683
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
684
+ """
685
+ 1. Put the provided model on the cpu
686
+ 2. Convert the ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
687
+ 3. Load it into the provided model
688
+
689
+ Args:
690
+ - ``model``: the model object to update
691
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
692
+ - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
693
+
694
+ Returns:
695
+ - ``model``: the modified model
696
+
697
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
698
+ have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
699
+ conveniently placed for you in the checkpoint folder.
700
+
701
+ A typical usage might be ::
702
+
703
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
704
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
705
+ # submit to model hub or save the model to share with others
706
+
707
+ Note that once this has run, the ``model`` will no longer be usable in the deepspeed context
708
+ of the same application, i.e. you will need to re-initialize the deepspeed engine, since
709
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
710
+
711
+ """
712
+ logger.info("Extracting fp32 weights")
713
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
714
+
715
+ logger.info("Overwriting model with fp32 weights")
716
+ model = model.cpu()
717
+ model.load_state_dict(state_dict, strict=False)
718
+
719
+ return model
720
+
721
+
722
+ if __name__ == "__main__":
723
+ parser = argparse.ArgumentParser()
724
+ parser.add_argument("checkpoint_dir",
725
+ type=str,
726
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
727
+ parser.add_argument("output_dir",
728
+ type=str,
729
+ help="directory for the pytorch fp32 state_dict output files "
730
+ "(e.g. path/checkpoint-12-output/)")
731
+ parser.add_argument(
732
+ "--max_shard_size",
733
+ type=str,
734
+ default="5GB",
735
+ help="The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be of a size "
736
+ "lower than this. If expressed as a string, it needs to be digits followed by a unit (like `5MB`). "
737
+ "We default it to 5GB in order for models to be able to run easily on free-tier Google Colab instances "
738
+ "without CPU OOM issues.")
739
+ parser.add_argument(
740
+ "--safe_serialization",
741
+ default=False,
742
+ action='store_true',
743
+ help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
744
+ parser.add_argument("-t",
745
+ "--tag",
746
+ type=str,
747
+ default=None,
748
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
749
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
750
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
751
+ args = parser.parse_args()
752
+
753
+ debug = args.debug
754
+
755
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
756
+ args.output_dir,
757
+ max_shard_size=args.max_shard_size,
758
+ safe_serialization=args.safe_serialization,
759
+ tag=args.tag,
760
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
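A copy of this script is saved inside each checkpoint folder of this upload. A minimal conversion sketch, assuming the ``global_step381`` ZeRO shard folder referenced by ``checkpoint-381/latest`` is present and the script's folder is importable (the output directory name is hypothetical), might be ::

    from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

    # consolidate the ZeRO partitions under checkpoint-381/global_step381
    # into fp32 safetensors shards of at most 5GB each
    convert_zero_checkpoint_to_fp32_state_dict(
        "checkpoint-381",           # folder containing the 'latest' file and the tag folder
        "checkpoint-381-fp32",      # hypothetical output directory
        max_shard_size="5GB",
        safe_serialization=True,
    )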
checkpoint-381/added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-381/config.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 21,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 28,
17
+ "num_key_value_heads": 2,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_scaling": null,
20
+ "rope_theta": 1000000.0,
21
+ "sliding_window": null,
22
+ "tie_word_embeddings": true,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.48.1",
25
+ "use_cache": false,
26
+ "use_sliding_window": false,
27
+ "vocab_size": 151665
28
+ }
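This config pins the fine-tune to the Qwen2 architecture of the base model (28 layers, hidden size 1536, vocabulary of 151665 including the added tokens). A minimal loading sketch, assuming the checkpoint folder is available locally, might be ::

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # torch_dtype="auto" honours the bfloat16 dtype recorded in config.json
    model = AutoModelForCausalLM.from_pretrained("checkpoint-381", torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained("checkpoint-381")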
checkpoint-381/generation_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.48.1"
14
+ }
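``model.generate`` picks up these sampling defaults (temperature 0.7, top_p 0.8, top_k 20, repetition penalty 1.1) automatically when the model is loaded with ``from_pretrained``. A short sketch, reusing the ``model`` and ``tokenizer`` from the loading example above ::

    inputs = tokenizer("Hello", return_tensors="pt")
    # sampling parameters default to the values in generation_config.json
    outputs = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))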
checkpoint-381/latest ADDED
@@ -0,0 +1 @@
1
+ global_step381
checkpoint-381/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-381/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e894aee6a90801f44c9691e3440b53d94bdf748ea5d51734b11a8228b54f1784
3
+ size 3552549728
checkpoint-381/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f3803bff3f596c03b55881de967a825b5734e4a581739164f9cb9e7fd1aee89
3
+ size 14512
checkpoint-381/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d768a04b798e2ca42effbe096b8e4481f32a402a9125a2ced390586dab8eb29e
3
+ size 14512
checkpoint-381/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:410d31e26656fe111807307d758f91b4394aefad48a9d1d7efaa9992c522efa9
3
+ size 1064
checkpoint-381/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-381/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoint-381/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
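The ``chat_template`` above renders conversations in Qwen's ChatML format, inserting the default system prompt when none is supplied and wrapping tool calls in ``<tool_call>`` tags. A sketch of applying it, reusing the ``tokenizer`` loaded earlier ::

    messages = [{"role": "user", "content": "Hi"}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # yields "<|im_start|>system\nYou are Qwen, ...<|im_end|>\n<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n"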
checkpoint-381/trainer_state.json ADDED
@@ -0,0 +1,2772 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 43,
6
+ "global_step": 381,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.007874015748031496,
13
+ "grad_norm": 118.11203002929688,
14
+ "learning_rate": 2.0000000000000003e-06,
15
+ "loss": 4.6099,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.007874015748031496,
20
+ "eval_loss": 3.1001100540161133,
21
+ "eval_runtime": 5.3966,
22
+ "eval_samples_per_second": 30.204,
23
+ "eval_steps_per_second": 3.891,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.015748031496062992,
28
+ "grad_norm": 118.4310302734375,
29
+ "learning_rate": 4.000000000000001e-06,
30
+ "loss": 4.5857,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 0.023622047244094488,
35
+ "grad_norm": 103.37439727783203,
36
+ "learning_rate": 6e-06,
37
+ "loss": 4.3069,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.031496062992125984,
42
+ "grad_norm": 75.05075073242188,
43
+ "learning_rate": 8.000000000000001e-06,
44
+ "loss": 3.8754,
45
+ "step": 4
46
+ },
47
+ {
48
+ "epoch": 0.03937007874015748,
49
+ "grad_norm": 50.459983825683594,
50
+ "learning_rate": 1e-05,
51
+ "loss": 3.2841,
52
+ "step": 5
53
+ },
54
+ {
55
+ "epoch": 0.047244094488188976,
56
+ "grad_norm": 47.4603385925293,
57
+ "learning_rate": 1.2e-05,
58
+ "loss": 2.4285,
59
+ "step": 6
60
+ },
61
+ {
62
+ "epoch": 0.05511811023622047,
63
+ "grad_norm": 32.362667083740234,
64
+ "learning_rate": 1.4e-05,
65
+ "loss": 1.8177,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.06299212598425197,
70
+ "grad_norm": 22.846933364868164,
71
+ "learning_rate": 1.6000000000000003e-05,
72
+ "loss": 1.1567,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.07086614173228346,
77
+ "grad_norm": 17.060213088989258,
78
+ "learning_rate": 1.8e-05,
79
+ "loss": 0.8257,
80
+ "step": 9
81
+ },
82
+ {
83
+ "epoch": 0.07874015748031496,
84
+ "grad_norm": 14.415579795837402,
85
+ "learning_rate": 2e-05,
86
+ "loss": 0.4257,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.08661417322834646,
91
+ "grad_norm": 7.753712177276611,
92
+ "learning_rate": 1.999964147509006e-05,
93
+ "loss": 0.2976,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.09448818897637795,
98
+ "grad_norm": 26.883708953857422,
99
+ "learning_rate": 1.9998565926068253e-05,
100
+ "loss": 0.3365,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.10236220472440945,
105
+ "grad_norm": 10.675631523132324,
106
+ "learning_rate": 1.9996773430056806e-05,
107
+ "loss": 0.2161,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.11023622047244094,
112
+ "grad_norm": 6.670111179351807,
113
+ "learning_rate": 1.999426411558661e-05,
114
+ "loss": 0.1816,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.11811023622047244,
119
+ "grad_norm": 8.878239631652832,
120
+ "learning_rate": 1.9991038162588018e-05,
121
+ "loss": 0.1567,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.12598425196850394,
126
+ "grad_norm": 2.9917383193969727,
127
+ "learning_rate": 1.9987095802377933e-05,
128
+ "loss": 0.0813,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.13385826771653545,
133
+ "grad_norm": 1.0548763275146484,
134
+ "learning_rate": 1.9982437317643218e-05,
135
+ "loss": 0.0217,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.14173228346456693,
140
+ "grad_norm": 2.8778488636016846,
141
+ "learning_rate": 1.9977063042420438e-05,
142
+ "loss": 0.0618,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.14960629921259844,
147
+ "grad_norm": 0.9811734557151794,
148
+ "learning_rate": 1.99709733620719e-05,
149
+ "loss": 0.0175,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.15748031496062992,
154
+ "grad_norm": 0.7218202948570251,
155
+ "learning_rate": 1.996416871325803e-05,
156
+ "loss": 0.0302,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.16535433070866143,
161
+ "grad_norm": 1.2746995687484741,
162
+ "learning_rate": 1.995664958390604e-05,
163
+ "loss": 0.0453,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 0.1732283464566929,
168
+ "grad_norm": 0.9413469433784485,
169
+ "learning_rate": 1.9948416513174976e-05,
170
+ "loss": 0.0175,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 0.18110236220472442,
175
+ "grad_norm": 1.4161137342453003,
176
+ "learning_rate": 1.9939470091417012e-05,
177
+ "loss": 0.0277,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.1889763779527559,
182
+ "grad_norm": 2.2721235752105713,
183
+ "learning_rate": 1.992981096013517e-05,
184
+ "loss": 0.0589,
185
+ "step": 24
186
+ },
187
+ {
188
+ "epoch": 0.1968503937007874,
189
+ "grad_norm": 1.143970251083374,
190
+ "learning_rate": 1.9919439811937283e-05,
191
+ "loss": 0.0182,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 0.2047244094488189,
196
+ "grad_norm": 0.8054028749465942,
197
+ "learning_rate": 1.9908357390486342e-05,
198
+ "loss": 0.0211,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 0.2125984251968504,
203
+ "grad_norm": 1.4449081420898438,
204
+ "learning_rate": 1.989656449044718e-05,
205
+ "loss": 0.0244,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 0.2204724409448819,
210
+ "grad_norm": 0.49216631054878235,
211
+ "learning_rate": 1.988406195742948e-05,
212
+ "loss": 0.005,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 0.2283464566929134,
217
+ "grad_norm": 0.9945647716522217,
218
+ "learning_rate": 1.987085068792715e-05,
219
+ "loss": 0.0373,
220
+ "step": 29
221
+ },
222
+ {
223
+ "epoch": 0.23622047244094488,
224
+ "grad_norm": 1.1753748655319214,
225
+ "learning_rate": 1.9856931629254032e-05,
226
+ "loss": 0.0217,
227
+ "step": 30
228
+ },
229
+ {
230
+ "epoch": 0.2440944881889764,
231
+ "grad_norm": 0.5960403680801392,
232
+ "learning_rate": 1.984230577947597e-05,
233
+ "loss": 0.0157,
234
+ "step": 31
235
+ },
236
+ {
237
+ "epoch": 0.25196850393700787,
238
+ "grad_norm": 0.3657272160053253,
239
+ "learning_rate": 1.9826974187339267e-05,
240
+ "loss": 0.0082,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 0.25984251968503935,
245
+ "grad_norm": 1.1290266513824463,
246
+ "learning_rate": 1.981093795219546e-05,
247
+ "loss": 0.0236,
248
+ "step": 33
249
+ },
250
+ {
251
+ "epoch": 0.2677165354330709,
252
+ "grad_norm": 1.673962116241455,
253
+ "learning_rate": 1.9794198223922496e-05,
254
+ "loss": 0.0182,
255
+ "step": 34
256
+ },
257
+ {
258
+ "epoch": 0.2755905511811024,
259
+ "grad_norm": 0.540355384349823,
260
+ "learning_rate": 1.9776756202842297e-05,
261
+ "loss": 0.011,
262
+ "step": 35
263
+ },
264
+ {
265
+ "epoch": 0.28346456692913385,
266
+ "grad_norm": 0.3380790054798126,
267
+ "learning_rate": 1.9758613139634662e-05,
268
+ "loss": 0.0048,
269
+ "step": 36
270
+ },
271
+ {
272
+ "epoch": 0.29133858267716534,
273
+ "grad_norm": 1.886232852935791,
274
+ "learning_rate": 1.9739770335247616e-05,
275
+ "loss": 0.0157,
276
+ "step": 37
277
+ },
278
+ {
279
+ "epoch": 0.2992125984251969,
280
+ "grad_norm": 2.140639305114746,
281
+ "learning_rate": 1.972022914080411e-05,
282
+ "loss": 0.0393,
283
+ "step": 38
284
+ },
285
+ {
286
+ "epoch": 0.30708661417322836,
287
+ "grad_norm": 0.35308870673179626,
288
+ "learning_rate": 1.9699990957505136e-05,
289
+ "loss": 0.0074,
290
+ "step": 39
291
+ },
292
+ {
293
+ "epoch": 0.31496062992125984,
294
+ "grad_norm": 0.3918301463127136,
295
+ "learning_rate": 1.9679057236529266e-05,
296
+ "loss": 0.0083,
297
+ "step": 40
298
+ },
299
+ {
300
+ "epoch": 0.3228346456692913,
301
+ "grad_norm": 0.4406338632106781,
302
+ "learning_rate": 1.965742947892858e-05,
303
+ "loss": 0.0152,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 0.33070866141732286,
308
+ "grad_norm": 0.6819682121276855,
309
+ "learning_rate": 1.9635109235521057e-05,
310
+ "loss": 0.0091,
311
+ "step": 42
312
+ },
313
+ {
314
+ "epoch": 0.33858267716535434,
315
+ "grad_norm": 0.6794927716255188,
316
+ "learning_rate": 1.961209810677934e-05,
317
+ "loss": 0.0071,
318
+ "step": 43
319
+ },
320
+ {
321
+ "epoch": 0.33858267716535434,
322
+ "eval_loss": 0.3895845115184784,
323
+ "eval_runtime": 6.5602,
324
+ "eval_samples_per_second": 24.847,
325
+ "eval_steps_per_second": 3.201,
326
+ "step": 43
327
+ },
328
+ {
329
+ "epoch": 0.3464566929133858,
330
+ "grad_norm": 0.3874967694282532,
331
+ "learning_rate": 1.9588397742716004e-05,
332
+ "loss": 0.0089,
333
+ "step": 44
334
+ },
335
+ {
336
+ "epoch": 0.3543307086614173,
337
+ "grad_norm": 0.5577577352523804,
338
+ "learning_rate": 1.9564009842765225e-05,
339
+ "loss": 0.0098,
340
+ "step": 45
341
+ },
342
+ {
343
+ "epoch": 0.36220472440944884,
344
+ "grad_norm": 0.8152347207069397,
345
+ "learning_rate": 1.9538936155660934e-05,
346
+ "loss": 0.0118,
347
+ "step": 46
348
+ },
349
+ {
350
+ "epoch": 0.3700787401574803,
351
+ "grad_norm": 0.2971118688583374,
352
+ "learning_rate": 1.951317847931141e-05,
353
+ "loss": 0.0084,
354
+ "step": 47
355
+ },
356
+ {
357
+ "epoch": 0.3779527559055118,
358
+ "grad_norm": 1.0286651849746704,
359
+ "learning_rate": 1.9486738660670373e-05,
360
+ "loss": 0.0123,
361
+ "step": 48
362
+ },
363
+ {
364
+ "epoch": 0.3858267716535433,
365
+ "grad_norm": 0.5227222442626953,
366
+ "learning_rate": 1.945961859560454e-05,
367
+ "loss": 0.0144,
368
+ "step": 49
369
+ },
370
+ {
371
+ "epoch": 0.3937007874015748,
372
+ "grad_norm": 0.461935818195343,
373
+ "learning_rate": 1.943182022875769e-05,
374
+ "loss": 0.0119,
375
+ "step": 50
376
+ },
377
+ {
378
+ "epoch": 0.4015748031496063,
379
+ "grad_norm": 1.2550626993179321,
380
+ "learning_rate": 1.940334555341122e-05,
381
+ "loss": 0.013,
382
+ "step": 51
383
+ },
384
+ {
385
+ "epoch": 0.4094488188976378,
386
+ "grad_norm": 0.37549659609794617,
387
+ "learning_rate": 1.9374196611341212e-05,
388
+ "loss": 0.0181,
389
+ "step": 52
390
+ },
391
+ {
392
+ "epoch": 0.41732283464566927,
393
+ "grad_norm": 0.3444191515445709,
394
+ "learning_rate": 1.9344375492672024e-05,
395
+ "loss": 0.0111,
396
+ "step": 53
397
+ },
398
+ {
399
+ "epoch": 0.4251968503937008,
400
+ "grad_norm": 0.3489387333393097,
401
+ "learning_rate": 1.9313884335726443e-05,
402
+ "loss": 0.0111,
403
+ "step": 54
404
+ },
405
+ {
406
+ "epoch": 0.4330708661417323,
407
+ "grad_norm": 0.26080814003944397,
408
+ "learning_rate": 1.9282725326872324e-05,
409
+ "loss": 0.0091,
410
+ "step": 55
411
+ },
412
+ {
413
+ "epoch": 0.4409448818897638,
414
+ "grad_norm": 0.1390451341867447,
415
+ "learning_rate": 1.9250900700365837e-05,
416
+ "loss": 0.0033,
417
+ "step": 56
418
+ },
419
+ {
420
+ "epoch": 0.44881889763779526,
421
+ "grad_norm": 0.20499111711978912,
422
+ "learning_rate": 1.921841273819125e-05,
423
+ "loss": 0.0066,
424
+ "step": 57
425
+ },
426
+ {
427
+ "epoch": 0.4566929133858268,
428
+ "grad_norm": 2.185487747192383,
429
+ "learning_rate": 1.918526376989731e-05,
430
+ "loss": 0.0095,
431
+ "step": 58
432
+ },
433
+ {
434
+ "epoch": 0.4645669291338583,
435
+ "grad_norm": 0.23939816653728485,
436
+ "learning_rate": 1.9151456172430186e-05,
437
+ "loss": 0.0048,
438
+ "step": 59
439
+ },
440
+ {
441
+ "epoch": 0.47244094488188976,
442
+ "grad_norm": 0.41510018706321716,
443
+ "learning_rate": 1.911699236996305e-05,
444
+ "loss": 0.0077,
445
+ "step": 60
446
+ },
447
+ {
448
+ "epoch": 0.48031496062992124,
449
+ "grad_norm": 0.264318585395813,
450
+ "learning_rate": 1.9081874833722234e-05,
451
+ "loss": 0.0129,
452
+ "step": 61
453
+ },
454
+ {
455
+ "epoch": 0.4881889763779528,
456
+ "grad_norm": 1.0443968772888184,
457
+ "learning_rate": 1.9046106081810047e-05,
458
+ "loss": 0.0035,
459
+ "step": 62
460
+ },
461
+ {
462
+ "epoch": 0.49606299212598426,
463
+ "grad_norm": 0.2800132632255554,
464
+ "learning_rate": 1.900968867902419e-05,
465
+ "loss": 0.0057,
466
+ "step": 63
467
+ },
468
+ {
469
+ "epoch": 0.5039370078740157,
470
+ "grad_norm": 1.114960789680481,
471
+ "learning_rate": 1.8972625236673887e-05,
472
+ "loss": 0.0123,
473
+ "step": 64
474
+ },
475
+ {
476
+ "epoch": 0.5118110236220472,
477
+ "grad_norm": 0.5027065873146057,
478
+ "learning_rate": 1.8934918412392596e-05,
479
+ "loss": 0.0052,
480
+ "step": 65
481
+ },
482
+ {
483
+ "epoch": 0.5196850393700787,
484
+ "grad_norm": 0.5564169883728027,
485
+ "learning_rate": 1.8896570909947477e-05,
486
+ "loss": 0.0085,
487
+ "step": 66
488
+ },
489
+ {
490
+ "epoch": 0.5275590551181102,
491
+ "grad_norm": 0.7567198872566223,
492
+ "learning_rate": 1.8857585479045493e-05,
493
+ "loss": 0.0054,
494
+ "step": 67
495
+ },
496
+ {
497
+ "epoch": 0.5354330708661418,
498
+ "grad_norm": 0.13573969900608063,
499
+ "learning_rate": 1.8817964915136277e-05,
500
+ "loss": 0.0008,
501
+ "step": 68
502
+ },
503
+ {
504
+ "epoch": 0.5433070866141733,
505
+ "grad_norm": 0.2704390287399292,
506
+ "learning_rate": 1.8777712059211643e-05,
507
+ "loss": 0.0078,
508
+ "step": 69
509
+ },
510
+ {
511
+ "epoch": 0.5511811023622047,
512
+ "grad_norm": 0.6014392971992493,
513
+ "learning_rate": 1.8736829797601903e-05,
514
+ "loss": 0.0059,
515
+ "step": 70
516
+ },
517
+ {
518
+ "epoch": 0.5590551181102362,
519
+ "grad_norm": 0.5487034916877747,
520
+ "learning_rate": 1.8695321061768886e-05,
521
+ "loss": 0.0097,
522
+ "step": 71
523
+ },
524
+ {
525
+ "epoch": 0.5669291338582677,
526
+ "grad_norm": 0.6670834422111511,
527
+ "learning_rate": 1.8653188828095754e-05,
528
+ "loss": 0.011,
529
+ "step": 72
530
+ },
531
+ {
532
+ "epoch": 0.5748031496062992,
533
+ "grad_norm": 0.1795203685760498,
534
+ "learning_rate": 1.8610436117673557e-05,
535
+ "loss": 0.0067,
536
+ "step": 73
537
+ },
538
+ {
539
+ "epoch": 0.5826771653543307,
540
+ "grad_norm": 1.768436074256897,
541
+ "learning_rate": 1.8567065996084628e-05,
542
+ "loss": 0.0096,
543
+ "step": 74
544
+ },
545
+ {
546
+ "epoch": 0.5905511811023622,
547
+ "grad_norm": 0.26233312487602234,
548
+ "learning_rate": 1.8523081573182754e-05,
549
+ "loss": 0.0124,
550
+ "step": 75
551
+ },
552
+ {
553
+ "epoch": 0.5984251968503937,
554
+ "grad_norm": 0.3775719404220581,
555
+ "learning_rate": 1.847848600287019e-05,
556
+ "loss": 0.0052,
557
+ "step": 76
558
+ },
559
+ {
560
+ "epoch": 0.6062992125984252,
561
+ "grad_norm": 1.0016565322875977,
562
+ "learning_rate": 1.8433282482871497e-05,
563
+ "loss": 0.0058,
564
+ "step": 77
565
+ },
566
+ {
567
+ "epoch": 0.6141732283464567,
568
+ "grad_norm": 0.20153792202472687,
569
+ "learning_rate": 1.8387474254504265e-05,
570
+ "loss": 0.0056,
571
+ "step": 78
572
+ },
573
+ {
574
+ "epoch": 0.6220472440944882,
575
+ "grad_norm": 0.5119822025299072,
576
+ "learning_rate": 1.8341064602446686e-05,
577
+ "loss": 0.0079,
578
+ "step": 79
579
+ },
580
+ {
581
+ "epoch": 0.6299212598425197,
582
+ "grad_norm": 1.5781004428863525,
583
+ "learning_rate": 1.829405685450202e-05,
584
+ "loss": 0.008,
585
+ "step": 80
586
+ },
587
+ {
588
+ "epoch": 0.6377952755905512,
589
+ "grad_norm": 0.23826757073402405,
590
+ "learning_rate": 1.824645438135999e-05,
591
+ "loss": 0.0041,
592
+ "step": 81
593
+ },
594
+ {
595
+ "epoch": 0.6456692913385826,
596
+ "grad_norm": 0.6386727690696716,
597
+ "learning_rate": 1.8198260596355077e-05,
598
+ "loss": 0.0188,
599
+ "step": 82
600
+ },
601
+ {
602
+ "epoch": 0.6535433070866141,
603
+ "grad_norm": 0.9503199458122253,
604
+ "learning_rate": 1.814947895522176e-05,
605
+ "loss": 0.008,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 0.6614173228346457,
610
+ "grad_norm": 0.2040701061487198,
611
+ "learning_rate": 1.8100112955846746e-05,
612
+ "loss": 0.0038,
613
+ "step": 84
614
+ },
615
+ {
616
+ "epoch": 0.6692913385826772,
617
+ "grad_norm": 0.3660199046134949,
618
+ "learning_rate": 1.805016613801813e-05,
619
+ "loss": 0.0148,
620
+ "step": 85
621
+ },
622
+ {
623
+ "epoch": 0.6771653543307087,
624
+ "grad_norm": 1.0502821207046509,
625
+ "learning_rate": 1.7999642083171576e-05,
626
+ "loss": 0.0098,
627
+ "step": 86
628
+ },
629
+ {
630
+ "epoch": 0.6771653543307087,
631
+ "eval_loss": 0.3526817262172699,
632
+ "eval_runtime": 6.6167,
633
+ "eval_samples_per_second": 24.635,
634
+ "eval_steps_per_second": 3.174,
635
+ "step": 86
636
+ },
637
+ {
638
+ "epoch": 0.6850393700787402,
639
+ "grad_norm": 0.13735969364643097,
640
+ "learning_rate": 1.7948544414133534e-05,
641
+ "loss": 0.0022,
642
+ "step": 87
643
+ },
644
+ {
645
+ "epoch": 0.6929133858267716,
646
+ "grad_norm": 0.6425012946128845,
647
+ "learning_rate": 1.7896876794861443e-05,
648
+ "loss": 0.0086,
649
+ "step": 88
650
+ },
651
+ {
652
+ "epoch": 0.7007874015748031,
653
+ "grad_norm": 0.7540380954742432,
654
+ "learning_rate": 1.7844642930181008e-05,
655
+ "loss": 0.0062,
656
+ "step": 89
657
+ },
658
+ {
659
+ "epoch": 0.7086614173228346,
660
+ "grad_norm": 0.6727365255355835,
661
+ "learning_rate": 1.779184656552056e-05,
662
+ "loss": 0.0027,
663
+ "step": 90
664
+ },
665
+ {
666
+ "epoch": 0.7165354330708661,
667
+ "grad_norm": 0.14059337973594666,
668
+ "learning_rate": 1.773849148664247e-05,
669
+ "loss": 0.0056,
670
+ "step": 91
671
+ },
672
+ {
673
+ "epoch": 0.7244094488188977,
674
+ "grad_norm": 0.33292093873023987,
675
+ "learning_rate": 1.7684581519371714e-05,
676
+ "loss": 0.0047,
677
+ "step": 92
678
+ },
679
+ {
680
+ "epoch": 0.7322834645669292,
681
+ "grad_norm": 0.3809877932071686,
682
+ "learning_rate": 1.7630120529321518e-05,
683
+ "loss": 0.0139,
684
+ "step": 93
685
+ },
686
+ {
687
+ "epoch": 0.7401574803149606,
688
+ "grad_norm": 1.729589819908142,
689
+ "learning_rate": 1.7575112421616203e-05,
690
+ "loss": 0.0128,
691
+ "step": 94
692
+ },
693
+ {
694
+ "epoch": 0.7480314960629921,
695
+ "grad_norm": 0.18192608654499054,
696
+ "learning_rate": 1.751956114061113e-05,
697
+ "loss": 0.0025,
698
+ "step": 95
699
+ },
700
+ {
701
+ "epoch": 0.7559055118110236,
702
+ "grad_norm": 1.0333118438720703,
703
+ "learning_rate": 1.7463470669609907e-05,
704
+ "loss": 0.006,
705
+ "step": 96
706
+ },
707
+ {
708
+ "epoch": 0.7637795275590551,
709
+ "grad_norm": 0.7247685194015503,
710
+ "learning_rate": 1.7406845030578747e-05,
711
+ "loss": 0.0073,
712
+ "step": 97
713
+ },
714
+ {
715
+ "epoch": 0.7716535433070866,
716
+ "grad_norm": 0.06979379802942276,
717
+ "learning_rate": 1.734968828385808e-05,
718
+ "loss": 0.0005,
719
+ "step": 98
720
+ },
721
+ {
722
+ "epoch": 0.7795275590551181,
723
+ "grad_norm": 0.5137119293212891,
724
+ "learning_rate": 1.729200452787139e-05,
725
+ "loss": 0.0082,
726
+ "step": 99
727
+ },
728
+ {
729
+ "epoch": 0.7874015748031497,
730
+ "grad_norm": 0.4704137146472931,
731
+ "learning_rate": 1.7233797898831376e-05,
732
+ "loss": 0.005,
733
+ "step": 100
734
+ },
735
+ {
736
+ "epoch": 0.7952755905511811,
737
+ "grad_norm": 0.28564465045928955,
738
+ "learning_rate": 1.717507257044331e-05,
739
+ "loss": 0.0052,
740
+ "step": 101
741
+ },
742
+ {
743
+ "epoch": 0.8031496062992126,
744
+ "grad_norm": 0.17685537040233612,
745
+ "learning_rate": 1.711583275360582e-05,
746
+ "loss": 0.0024,
747
+ "step": 102
748
+ },
749
+ {
750
+ "epoch": 0.8110236220472441,
751
+ "grad_norm": 0.45714935660362244,
752
+ "learning_rate": 1.7056082696108896e-05,
753
+ "loss": 0.0072,
754
+ "step": 103
755
+ },
756
+ {
757
+ "epoch": 0.8188976377952756,
758
+ "grad_norm": 0.4373086988925934,
759
+ "learning_rate": 1.699582668232934e-05,
760
+ "loss": 0.0051,
761
+ "step": 104
762
+ },
763
+ {
764
+ "epoch": 0.8267716535433071,
765
+ "grad_norm": 0.8478983640670776,
766
+ "learning_rate": 1.6935069032923525e-05,
767
+ "loss": 0.022,
768
+ "step": 105
769
+ },
770
+ {
771
+ "epoch": 0.8346456692913385,
772
+ "grad_norm": 0.16181086003780365,
773
+ "learning_rate": 1.6873814104517617e-05,
774
+ "loss": 0.0058,
775
+ "step": 106
776
+ },
777
+ {
778
+ "epoch": 0.84251968503937,
779
+ "grad_norm": 0.09503592550754547,
780
+ "learning_rate": 1.6812066289395157e-05,
781
+ "loss": 0.0009,
782
+ "step": 107
783
+ },
784
+ {
785
+ "epoch": 0.8503937007874016,
786
+ "grad_norm": 0.7462632060050964,
787
+ "learning_rate": 1.6749830015182106e-05,
788
+ "loss": 0.0044,
789
+ "step": 108
790
+ },
791
+ {
792
+ "epoch": 0.8582677165354331,
793
+ "grad_norm": 0.07221701741218567,
794
+ "learning_rate": 1.6687109744529394e-05,
795
+ "loss": 0.0015,
796
+ "step": 109
797
+ },
798
+ {
799
+ "epoch": 0.8661417322834646,
800
+ "grad_norm": 0.08999036252498627,
801
+ "learning_rate": 1.6623909974792888e-05,
802
+ "loss": 0.0023,
803
+ "step": 110
804
+ },
805
+ {
806
+ "epoch": 0.8740157480314961,
807
+ "grad_norm": 0.42536938190460205,
808
+ "learning_rate": 1.656023523771095e-05,
809
+ "loss": 0.005,
810
+ "step": 111
811
+ },
812
+ {
813
+ "epoch": 0.8818897637795275,
814
+ "grad_norm": 0.7885191440582275,
815
+ "learning_rate": 1.6496090099079452e-05,
816
+ "loss": 0.0103,
817
+ "step": 112
818
+ },
819
+ {
820
+ "epoch": 0.889763779527559,
821
+ "grad_norm": 0.16610018908977509,
822
+ "learning_rate": 1.64314791584244e-05,
823
+ "loss": 0.006,
824
+ "step": 113
825
+ },
826
+ {
827
+ "epoch": 0.8976377952755905,
828
+ "grad_norm": 0.32151034474372864,
829
+ "learning_rate": 1.6366407048672135e-05,
830
+ "loss": 0.0086,
831
+ "step": 114
832
+ },
833
+ {
834
+ "epoch": 0.905511811023622,
835
+ "grad_norm": 0.557732343673706,
836
+ "learning_rate": 1.6300878435817115e-05,
837
+ "loss": 0.0064,
838
+ "step": 115
839
+ },
840
+ {
841
+ "epoch": 0.9133858267716536,
842
+ "grad_norm": 0.2238176167011261,
843
+ "learning_rate": 1.6234898018587336e-05,
844
+ "loss": 0.0065,
845
+ "step": 116
846
+ },
847
+ {
848
+ "epoch": 0.9212598425196851,
849
+ "grad_norm": 0.2980042099952698,
850
+ "learning_rate": 1.616847052810744e-05,
851
+ "loss": 0.0095,
852
+ "step": 117
853
+ },
854
+ {
855
+ "epoch": 0.9291338582677166,
856
+ "grad_norm": 0.1529705822467804,
857
+ "learning_rate": 1.6101600727559423e-05,
858
+ "loss": 0.0062,
859
+ "step": 118
860
+ },
861
+ {
862
+ "epoch": 0.937007874015748,
863
+ "grad_norm": 0.017149658873677254,
864
+ "learning_rate": 1.603429341184114e-05,
865
+ "loss": 0.0002,
866
+ "step": 119
867
+ },
868
+ {
869
+ "epoch": 0.9448818897637795,
870
+ "grad_norm": 0.4514746367931366,
871
+ "learning_rate": 1.596655340722244e-05,
872
+ "loss": 0.0067,
873
+ "step": 120
874
+ },
875
+ {
876
+ "epoch": 0.952755905511811,
877
+ "grad_norm": 0.11766134947538376,
878
+ "learning_rate": 1.5898385570999146e-05,
879
+ "loss": 0.0053,
880
+ "step": 121
881
+ },
882
+ {
883
+ "epoch": 0.9606299212598425,
884
+ "grad_norm": 0.4089784026145935,
885
+ "learning_rate": 1.5829794791144723e-05,
886
+ "loss": 0.0085,
887
+ "step": 122
888
+ },
889
+ {
890
+ "epoch": 0.968503937007874,
891
+ "grad_norm": 0.1353057473897934,
892
+ "learning_rate": 1.57607859859598e-05,
893
+ "loss": 0.0013,
894
+ "step": 123
895
+ },
896
+ {
897
+ "epoch": 0.9763779527559056,
898
+ "grad_norm": 0.6548481583595276,
899
+ "learning_rate": 1.5691364103719515e-05,
900
+ "loss": 0.0117,
901
+ "step": 124
902
+ },
903
+ {
904
+ "epoch": 0.984251968503937,
905
+ "grad_norm": 0.1571267992258072,
906
+ "learning_rate": 1.5621534122318682e-05,
907
+ "loss": 0.0049,
908
+ "step": 125
909
+ },
910
+ {
911
+ "epoch": 0.9921259842519685,
912
+ "grad_norm": 1.2177189588546753,
913
+ "learning_rate": 1.5551301048914863e-05,
914
+ "loss": 0.0161,
915
+ "step": 126
916
+ },
917
+ {
918
+ "epoch": 1.0,
919
+ "grad_norm": 0.414489209651947,
920
+ "learning_rate": 1.5480669919569313e-05,
921
+ "loss": 0.0181,
922
+ "step": 127
923
+ },
924
+ {
925
+ "epoch": 1.0078740157480315,
926
+ "grad_norm": 0.10985995829105377,
927
+ "learning_rate": 1.54096457988859e-05,
928
+ "loss": 0.0049,
929
+ "step": 128
930
+ },
931
+ {
932
+ "epoch": 1.015748031496063,
933
+ "grad_norm": 0.12780147790908813,
934
+ "learning_rate": 1.533823377964791e-05,
935
+ "loss": 0.0026,
936
+ "step": 129
937
+ },
938
+ {
939
+ "epoch": 1.015748031496063,
940
+ "eval_loss": 0.33064374327659607,
941
+ "eval_runtime": 6.9286,
942
+ "eval_samples_per_second": 23.526,
943
+ "eval_steps_per_second": 3.031,
944
+ "step": 129
945
+ },
946
+ {
947
+ "epoch": 1.0236220472440944,
948
+ "grad_norm": 0.5142458081245422,
949
+ "learning_rate": 1.52664389824529e-05,
950
+ "loss": 0.0082,
951
+ "step": 130
952
+ },
953
+ {
954
+ "epoch": 1.031496062992126,
955
+ "grad_norm": 0.15617145597934723,
956
+ "learning_rate": 1.5194266555345505e-05,
957
+ "loss": 0.0016,
958
+ "step": 131
959
+ },
960
+ {
961
+ "epoch": 1.0393700787401574,
962
+ "grad_norm": 0.5782387852668762,
963
+ "learning_rate": 1.5121721673448319e-05,
964
+ "loss": 0.0117,
965
+ "step": 132
966
+ },
967
+ {
968
+ "epoch": 1.047244094488189,
969
+ "grad_norm": 0.08414836972951889,
970
+ "learning_rate": 1.5048809538590789e-05,
971
+ "loss": 0.0021,
972
+ "step": 133
973
+ },
974
+ {
975
+ "epoch": 1.0551181102362204,
976
+ "grad_norm": 0.28253939747810364,
977
+ "learning_rate": 1.4975535378936228e-05,
978
+ "loss": 0.0055,
979
+ "step": 134
980
+ },
981
+ {
982
+ "epoch": 1.0629921259842519,
983
+ "grad_norm": 0.47917842864990234,
984
+ "learning_rate": 1.490190444860694e-05,
985
+ "loss": 0.0046,
986
+ "step": 135
987
+ },
988
+ {
989
+ "epoch": 1.0708661417322836,
990
+ "grad_norm": 0.1895662248134613,
991
+ "learning_rate": 1.482792202730745e-05,
992
+ "loss": 0.006,
993
+ "step": 136
994
+ },
995
+ {
996
+ "epoch": 1.078740157480315,
997
+ "grad_norm": 0.13722768425941467,
998
+ "learning_rate": 1.475359341994595e-05,
999
+ "loss": 0.0031,
1000
+ "step": 137
1001
+ },
1002
+ {
1003
+ "epoch": 1.0866141732283465,
1004
+ "grad_norm": 0.10731153190135956,
1005
+ "learning_rate": 1.4678923956253894e-05,
1006
+ "loss": 0.0005,
1007
+ "step": 138
1008
+ },
1009
+ {
1010
+ "epoch": 1.094488188976378,
1011
+ "grad_norm": 0.12261265516281128,
1012
+ "learning_rate": 1.460391899040383e-05,
1013
+ "loss": 0.0031,
1014
+ "step": 139
1015
+ },
1016
+ {
1017
+ "epoch": 1.1023622047244095,
1018
+ "grad_norm": 0.0038245893083512783,
1019
+ "learning_rate": 1.4528583900625481e-05,
1020
+ "loss": 0.0,
1021
+ "step": 140
1022
+ },
1023
+ {
1024
+ "epoch": 1.110236220472441,
1025
+ "grad_norm": 0.28762558102607727,
1026
+ "learning_rate": 1.4452924088820101e-05,
1027
+ "loss": 0.004,
1028
+ "step": 141
1029
+ },
1030
+ {
1031
+ "epoch": 1.1181102362204725,
1032
+ "grad_norm": 0.17267552018165588,
1033
+ "learning_rate": 1.4376944980173138e-05,
1034
+ "loss": 0.0002,
1035
+ "step": 142
1036
+ },
1037
+ {
1038
+ "epoch": 1.125984251968504,
1039
+ "grad_norm": 0.12727122008800507,
1040
+ "learning_rate": 1.4300652022765207e-05,
1041
+ "loss": 0.0029,
1042
+ "step": 143
1043
+ },
1044
+ {
1045
+ "epoch": 1.1338582677165354,
1046
+ "grad_norm": 0.25049135088920593,
1047
+ "learning_rate": 1.4224050687181442e-05,
1048
+ "loss": 0.0108,
1049
+ "step": 144
1050
+ },
1051
+ {
1052
+ "epoch": 1.141732283464567,
1053
+ "grad_norm": 0.16092728078365326,
1054
+ "learning_rate": 1.4147146466119235e-05,
1055
+ "loss": 0.0024,
1056
+ "step": 145
1057
+ },
1058
+ {
1059
+ "epoch": 1.1496062992125984,
1060
+ "grad_norm": 0.13642658293247223,
1061
+ "learning_rate": 1.406994487399437e-05,
1062
+ "loss": 0.0037,
1063
+ "step": 146
1064
+ },
1065
+ {
1066
+ "epoch": 1.1574803149606299,
1067
+ "grad_norm": 0.9029403328895569,
1068
+ "learning_rate": 1.3992451446545624e-05,
1069
+ "loss": 0.0034,
1070
+ "step": 147
1071
+ },
1072
+ {
1073
+ "epoch": 1.1653543307086613,
1074
+ "grad_norm": 0.19518424570560455,
1075
+ "learning_rate": 1.3914671740437811e-05,
1076
+ "loss": 0.0057,
1077
+ "step": 148
1078
+ },
1079
+ {
1080
+ "epoch": 1.1732283464566928,
1081
+ "grad_norm": 0.12140502035617828,
1082
+ "learning_rate": 1.3836611332863356e-05,
1083
+ "loss": 0.0041,
1084
+ "step": 149
1085
+ },
1086
+ {
1087
+ "epoch": 1.1811023622047245,
1088
+ "grad_norm": 0.5148038864135742,
1089
+ "learning_rate": 1.3758275821142382e-05,
1090
+ "loss": 0.0026,
1091
+ "step": 150
1092
+ },
1093
+ {
1094
+ "epoch": 1.188976377952756,
1095
+ "grad_norm": 1.828904390335083,
1096
+ "learning_rate": 1.3679670822321347e-05,
1097
+ "loss": 0.0024,
1098
+ "step": 151
1099
+ },
1100
+ {
1101
+ "epoch": 1.1968503937007875,
1102
+ "grad_norm": 0.3571717143058777,
1103
+ "learning_rate": 1.3600801972770272e-05,
1104
+ "loss": 0.0106,
1105
+ "step": 152
1106
+ },
1107
+ {
1108
+ "epoch": 1.204724409448819,
1109
+ "grad_norm": 0.051027003675699234,
1110
+ "learning_rate": 1.3521674927778594e-05,
1111
+ "loss": 0.0003,
1112
+ "step": 153
1113
+ },
1114
+ {
1115
+ "epoch": 1.2125984251968505,
1116
+ "grad_norm": 0.6490982174873352,
1117
+ "learning_rate": 1.3442295361149651e-05,
1118
+ "loss": 0.0035,
1119
+ "step": 154
1120
+ },
1121
+ {
1122
+ "epoch": 1.220472440944882,
1123
+ "grad_norm": 0.08408445864915848,
1124
+ "learning_rate": 1.336266896479384e-05,
1125
+ "loss": 0.0027,
1126
+ "step": 155
1127
+ },
1128
+ {
1129
+ "epoch": 1.2283464566929134,
1130
+ "grad_norm": 0.09666562080383301,
1131
+ "learning_rate": 1.328280144832047e-05,
1132
+ "loss": 0.0019,
1133
+ "step": 156
1134
+ },
1135
+ {
1136
+ "epoch": 1.236220472440945,
1137
+ "grad_norm": 0.03880690038204193,
1138
+ "learning_rate": 1.3202698538628376e-05,
1139
+ "loss": 0.0003,
1140
+ "step": 157
1141
+ },
1142
+ {
1143
+ "epoch": 1.2440944881889764,
1144
+ "grad_norm": 0.11940775066614151,
1145
+ "learning_rate": 1.3122365979495259e-05,
1146
+ "loss": 0.0024,
1147
+ "step": 158
1148
+ },
1149
+ {
1150
+ "epoch": 1.2519685039370079,
1151
+ "grad_norm": 0.1442880481481552,
1152
+ "learning_rate": 1.3041809531165819e-05,
1153
+ "loss": 0.0015,
1154
+ "step": 159
1155
+ },
1156
+ {
1157
+ "epoch": 1.2598425196850394,
1158
+ "grad_norm": 0.1961939036846161,
1159
+ "learning_rate": 1.2961034969938732e-05,
1160
+ "loss": 0.0056,
1161
+ "step": 160
1162
+ },
1163
+ {
1164
+ "epoch": 1.2677165354330708,
1165
+ "grad_norm": 0.26947638392448425,
1166
+ "learning_rate": 1.288004808775246e-05,
1167
+ "loss": 0.0028,
1168
+ "step": 161
1169
+ },
1170
+ {
1171
+ "epoch": 1.2755905511811023,
1172
+ "grad_norm": 0.5154056549072266,
1173
+ "learning_rate": 1.2798854691769927e-05,
1174
+ "loss": 0.0037,
1175
+ "step": 162
1176
+ },
1177
+ {
1178
+ "epoch": 1.2834645669291338,
1179
+ "grad_norm": 0.4292369782924652,
1180
+ "learning_rate": 1.2717460603962132e-05,
1181
+ "loss": 0.0029,
1182
+ "step": 163
1183
+ },
1184
+ {
1185
+ "epoch": 1.2913385826771653,
1186
+ "grad_norm": 0.19139212369918823,
1187
+ "learning_rate": 1.2635871660690677e-05,
1188
+ "loss": 0.0061,
1189
+ "step": 164
1190
+ },
1191
+ {
1192
+ "epoch": 1.2992125984251968,
1193
+ "grad_norm": 0.19960306584835052,
1194
+ "learning_rate": 1.2554093712289267e-05,
1195
+ "loss": 0.005,
1196
+ "step": 165
1197
+ },
1198
+ {
1199
+ "epoch": 1.3070866141732282,
1200
+ "grad_norm": 0.4523830711841583,
1201
+ "learning_rate": 1.2472132622644222e-05,
1202
+ "loss": 0.0065,
1203
+ "step": 166
1204
+ },
1205
+ {
1206
+ "epoch": 1.3149606299212597,
1207
+ "grad_norm": 0.49343299865722656,
1208
+ "learning_rate": 1.2389994268773995e-05,
1209
+ "loss": 0.0061,
1210
+ "step": 167
1211
+ },
1212
+ {
1213
+ "epoch": 1.3228346456692912,
1214
+ "grad_norm": 0.01938088797032833,
1215
+ "learning_rate": 1.2307684540407775e-05,
1216
+ "loss": 0.0001,
1217
+ "step": 168
1218
+ },
1219
+ {
1220
+ "epoch": 1.330708661417323,
1221
+ "grad_norm": 0.3082112669944763,
1222
+ "learning_rate": 1.2225209339563144e-05,
1223
+ "loss": 0.0053,
1224
+ "step": 169
1225
+ },
1226
+ {
1227
+ "epoch": 1.3385826771653544,
1228
+ "grad_norm": 0.01982509344816208,
1229
+ "learning_rate": 1.2142574580122903e-05,
1230
+ "loss": 0.0001,
1231
+ "step": 170
1232
+ },
1233
+ {
1234
+ "epoch": 1.3464566929133859,
1235
+ "grad_norm": 0.12388588488101959,
1236
+ "learning_rate": 1.2059786187410984e-05,
1237
+ "loss": 0.0049,
1238
+ "step": 171
1239
+ },
1240
+ {
1241
+ "epoch": 1.3543307086614174,
1242
+ "grad_norm": 0.43759095668792725,
1243
+ "learning_rate": 1.1976850097767598e-05,
1244
+ "loss": 0.0128,
1245
+ "step": 172
1246
+ },
1247
+ {
1248
+ "epoch": 1.3543307086614174,
1249
+ "eval_loss": 0.3166251480579376,
1250
+ "eval_runtime": 6.9515,
1251
+ "eval_samples_per_second": 23.448,
1252
+ "eval_steps_per_second": 3.021,
1253
+ "step": 172
1254
+ },
1255
+ {
1256
+ "epoch": 1.3622047244094488,
1257
+ "grad_norm": 0.46561670303344727,
1258
+ "learning_rate": 1.1893772258123554e-05,
1259
+ "loss": 0.008,
1260
+ "step": 173
1261
+ },
1262
+ {
1263
+ "epoch": 1.3700787401574803,
1264
+ "grad_norm": 0.16612188518047333,
1265
+ "learning_rate": 1.1810558625573856e-05,
1266
+ "loss": 0.0024,
1267
+ "step": 174
1268
+ },
1269
+ {
1270
+ "epoch": 1.3779527559055118,
1271
+ "grad_norm": 0.13628093898296356,
1272
+ "learning_rate": 1.1727215166950519e-05,
1273
+ "loss": 0.0045,
1274
+ "step": 175
1275
+ },
1276
+ {
1277
+ "epoch": 1.3858267716535433,
1278
+ "grad_norm": 0.565229058265686,
1279
+ "learning_rate": 1.1643747858394743e-05,
1280
+ "loss": 0.0103,
1281
+ "step": 176
1282
+ },
1283
+ {
1284
+ "epoch": 1.3937007874015748,
1285
+ "grad_norm": 0.14550763368606567,
1286
+ "learning_rate": 1.156016268492839e-05,
1287
+ "loss": 0.0028,
1288
+ "step": 177
1289
+ },
1290
+ {
1291
+ "epoch": 1.4015748031496063,
1292
+ "grad_norm": 0.12460129708051682,
1293
+ "learning_rate": 1.1476465640024814e-05,
1294
+ "loss": 0.0031,
1295
+ "step": 178
1296
+ },
1297
+ {
1298
+ "epoch": 1.4094488188976377,
1299
+ "grad_norm": 0.19089221954345703,
1300
+ "learning_rate": 1.1392662725179114e-05,
1301
+ "loss": 0.0035,
1302
+ "step": 179
1303
+ },
1304
+ {
1305
+ "epoch": 1.4173228346456692,
1306
+ "grad_norm": 0.6106573343276978,
1307
+ "learning_rate": 1.1308759949477786e-05,
1308
+ "loss": 0.0088,
1309
+ "step": 180
1310
+ },
1311
+ {
1312
+ "epoch": 1.425196850393701,
1313
+ "grad_norm": 0.20053207874298096,
1314
+ "learning_rate": 1.1224763329167859e-05,
1315
+ "loss": 0.0033,
1316
+ "step": 181
1317
+ },
1318
+ {
1319
+ "epoch": 1.4330708661417324,
1320
+ "grad_norm": 0.1984691321849823,
1321
+ "learning_rate": 1.1140678887225468e-05,
1322
+ "loss": 0.0051,
1323
+ "step": 182
1324
+ },
1325
+ {
1326
+ "epoch": 1.4409448818897639,
1327
+ "grad_norm": 0.19264858961105347,
1328
+ "learning_rate": 1.1056512652924014e-05,
1329
+ "loss": 0.0046,
1330
+ "step": 183
1331
+ },
1332
+ {
1333
+ "epoch": 1.4488188976377954,
1334
+ "grad_norm": 0.10979076474905014,
1335
+ "learning_rate": 1.0972270661401812e-05,
1336
+ "loss": 0.0031,
1337
+ "step": 184
1338
+ },
1339
+ {
1340
+ "epoch": 1.4566929133858268,
1341
+ "grad_norm": 0.1744084656238556,
1342
+ "learning_rate": 1.0887958953229349e-05,
1343
+ "loss": 0.0024,
1344
+ "step": 185
1345
+ },
1346
+ {
1347
+ "epoch": 1.4645669291338583,
1348
+ "grad_norm": 0.20646224915981293,
1349
+ "learning_rate": 1.0803583573976137e-05,
1350
+ "loss": 0.008,
1351
+ "step": 186
1352
+ },
1353
+ {
1354
+ "epoch": 1.4724409448818898,
1355
+ "grad_norm": 0.14391584694385529,
1356
+ "learning_rate": 1.0719150573777226e-05,
1357
+ "loss": 0.004,
1358
+ "step": 187
1359
+ },
1360
+ {
1361
+ "epoch": 1.4803149606299213,
1362
+ "grad_norm": 0.36887863278388977,
1363
+ "learning_rate": 1.0634666006899375e-05,
1364
+ "loss": 0.0074,
1365
+ "step": 188
1366
+ },
1367
+ {
1368
+ "epoch": 1.4881889763779528,
1369
+ "grad_norm": 0.21352627873420715,
1370
+ "learning_rate": 1.055013593130693e-05,
1371
+ "loss": 0.0082,
1372
+ "step": 189
1373
+ },
1374
+ {
1375
+ "epoch": 1.4960629921259843,
1376
+ "grad_norm": 0.22443020343780518,
1377
+ "learning_rate": 1.046556640822744e-05,
1378
+ "loss": 0.0087,
1379
+ "step": 190
1380
+ },
1381
+ {
1382
+ "epoch": 1.5039370078740157,
1383
+ "grad_norm": 0.4243764281272888,
1384
+ "learning_rate": 1.0380963501717034e-05,
1385
+ "loss": 0.0068,
1386
+ "step": 191
1387
+ },
1388
+ {
1389
+ "epoch": 1.5118110236220472,
1390
+ "grad_norm": 0.17558562755584717,
1391
+ "learning_rate": 1.0296333278225599e-05,
1392
+ "loss": 0.0054,
1393
+ "step": 192
1394
+ },
1395
+ {
1396
+ "epoch": 1.5196850393700787,
1397
+ "grad_norm": 0.14842620491981506,
1398
+ "learning_rate": 1.0211681806161787e-05,
1399
+ "loss": 0.0031,
1400
+ "step": 193
1401
+ },
1402
+ {
1403
+ "epoch": 1.5275590551181102,
1404
+ "grad_norm": 0.09316081553697586,
1405
+ "learning_rate": 1.0127015155457875e-05,
1406
+ "loss": 0.0013,
1407
+ "step": 194
1408
+ },
1409
+ {
1410
+ "epoch": 1.5354330708661417,
1411
+ "grad_norm": 0.19795025885105133,
1412
+ "learning_rate": 1.0042339397134528e-05,
1413
+ "loss": 0.0051,
1414
+ "step": 195
1415
+ },
1416
+ {
1417
+ "epoch": 1.5433070866141732,
1418
+ "grad_norm": 0.21606990694999695,
1419
+ "learning_rate": 9.957660602865477e-06,
1420
+ "loss": 0.0041,
1421
+ "step": 196
1422
+ },
1423
+ {
1424
+ "epoch": 1.5511811023622046,
1425
+ "grad_norm": 0.18036173284053802,
1426
+ "learning_rate": 9.872984844542128e-06,
1427
+ "loss": 0.0037,
1428
+ "step": 197
1429
+ },
1430
+ {
1431
+ "epoch": 1.5590551181102361,
1432
+ "grad_norm": 0.18953870236873627,
1433
+ "learning_rate": 9.788318193838218e-06,
1434
+ "loss": 0.0041,
1435
+ "step": 198
1436
+ },
1437
+ {
1438
+ "epoch": 1.5669291338582676,
1439
+ "grad_norm": 0.12346503138542175,
1440
+ "learning_rate": 9.703666721774403e-06,
1441
+ "loss": 0.0035,
1442
+ "step": 199
1443
+ },
1444
+ {
1445
+ "epoch": 1.574803149606299,
1446
+ "grad_norm": 0.4576225280761719,
1447
+ "learning_rate": 9.619036498282968e-06,
1448
+ "loss": 0.0041,
1449
+ "step": 200
1450
+ },
1451
+ {
1452
+ "epoch": 1.5826771653543306,
1453
+ "grad_norm": 0.10333681106567383,
1454
+ "learning_rate": 9.534433591772562e-06,
1455
+ "loss": 0.0011,
1456
+ "step": 201
1457
+ },
1458
+ {
1459
+ "epoch": 1.590551181102362,
1460
+ "grad_norm": 0.19167865812778473,
1461
+ "learning_rate": 9.449864068693072e-06,
1462
+ "loss": 0.0062,
1463
+ "step": 202
1464
+ },
1465
+ {
1466
+ "epoch": 1.5984251968503937,
1467
+ "grad_norm": 0.2258184254169464,
1468
+ "learning_rate": 9.365333993100628e-06,
1469
+ "loss": 0.003,
1470
+ "step": 203
1471
+ },
1472
+ {
1473
+ "epoch": 1.6062992125984252,
1474
+ "grad_norm": 0.07945302873849869,
1475
+ "learning_rate": 9.280849426222778e-06,
1476
+ "loss": 0.0008,
1477
+ "step": 204
1478
+ },
1479
+ {
1480
+ "epoch": 1.6141732283464567,
1481
+ "grad_norm": 0.17767398059368134,
1482
+ "learning_rate": 9.196416426023868e-06,
1483
+ "loss": 0.0053,
1484
+ "step": 205
1485
+ },
1486
+ {
1487
+ "epoch": 1.6220472440944882,
1488
+ "grad_norm": 0.12704500555992126,
1489
+ "learning_rate": 9.112041046770653e-06,
1490
+ "loss": 0.0023,
1491
+ "step": 206
1492
+ },
1493
+ {
1494
+ "epoch": 1.6299212598425197,
1495
+ "grad_norm": 0.4054742753505707,
1496
+ "learning_rate": 9.027729338598188e-06,
1497
+ "loss": 0.0045,
1498
+ "step": 207
1499
+ },
1500
+ {
1501
+ "epoch": 1.6377952755905512,
1502
+ "grad_norm": 0.4463757574558258,
1503
+ "learning_rate": 8.943487347075988e-06,
1504
+ "loss": 0.007,
1505
+ "step": 208
1506
+ },
1507
+ {
1508
+ "epoch": 1.6456692913385826,
1509
+ "grad_norm": 0.6517045497894287,
1510
+ "learning_rate": 8.859321112774535e-06,
1511
+ "loss": 0.0052,
1512
+ "step": 209
1513
+ },
1514
+ {
1515
+ "epoch": 1.6535433070866141,
1516
+ "grad_norm": 0.1542089730501175,
1517
+ "learning_rate": 8.775236670832146e-06,
1518
+ "loss": 0.0047,
1519
+ "step": 210
1520
+ },
1521
+ {
1522
+ "epoch": 1.6614173228346458,
1523
+ "grad_norm": 0.14716440439224243,
1524
+ "learning_rate": 8.691240050522215e-06,
1525
+ "loss": 0.0049,
1526
+ "step": 211
1527
+ },
1528
+ {
1529
+ "epoch": 1.6692913385826773,
1530
+ "grad_norm": 0.2997347116470337,
1531
+ "learning_rate": 8.607337274820888e-06,
1532
+ "loss": 0.0076,
1533
+ "step": 212
1534
+ },
1535
+ {
1536
+ "epoch": 1.6771653543307088,
1537
+ "grad_norm": 0.22548256814479828,
1538
+ "learning_rate": 8.52353435997519e-06,
1539
+ "loss": 0.0063,
1540
+ "step": 213
1541
+ },
1542
+ {
1543
+ "epoch": 1.6850393700787403,
1544
+ "grad_norm": 0.7220733165740967,
1545
+ "learning_rate": 8.439837315071612e-06,
1546
+ "loss": 0.0089,
1547
+ "step": 214
1548
+ },
1549
+ {
1550
+ "epoch": 1.6929133858267718,
1551
+ "grad_norm": 0.5101618766784668,
1552
+ "learning_rate": 8.35625214160526e-06,
1553
+ "loss": 0.0042,
1554
+ "step": 215
1555
+ },
1556
+ {
1557
+ "epoch": 1.6929133858267718,
1558
+ "eval_loss": 0.3484288156032562,
1559
+ "eval_runtime": 6.4482,
1560
+ "eval_samples_per_second": 25.278,
1561
+ "eval_steps_per_second": 3.257,
1562
+ "step": 215
1563
+ },
1564
+ {
1565
+ "epoch": 1.7007874015748032,
1566
+ "grad_norm": 0.1698393076658249,
1567
+ "learning_rate": 8.272784833049485e-06,
1568
+ "loss": 0.0028,
1569
+ "step": 216
1570
+ },
1571
+ {
1572
+ "epoch": 1.7086614173228347,
1573
+ "grad_norm": 0.5772718191146851,
1574
+ "learning_rate": 8.18944137442615e-06,
1575
+ "loss": 0.0082,
1576
+ "step": 217
1577
+ },
1578
+ {
1579
+ "epoch": 1.7165354330708662,
1580
+ "grad_norm": 0.09606469422578812,
1581
+ "learning_rate": 8.106227741876447e-06,
1582
+ "loss": 0.0011,
1583
+ "step": 218
1584
+ },
1585
+ {
1586
+ "epoch": 1.7244094488188977,
1587
+ "grad_norm": 0.14510361850261688,
1588
+ "learning_rate": 8.023149902232404e-06,
1589
+ "loss": 0.0015,
1590
+ "step": 219
1591
+ },
1592
+ {
1593
+ "epoch": 1.7322834645669292,
1594
+ "grad_norm": 0.055804118514060974,
1595
+ "learning_rate": 7.940213812589018e-06,
1596
+ "loss": 0.0008,
1597
+ "step": 220
1598
+ },
1599
+ {
1600
+ "epoch": 1.7401574803149606,
1601
+ "grad_norm": 0.13318321108818054,
1602
+ "learning_rate": 7.857425419877097e-06,
1603
+ "loss": 0.005,
1604
+ "step": 221
1605
+ },
1606
+ {
1607
+ "epoch": 1.7480314960629921,
1608
+ "grad_norm": 0.23600782454013824,
1609
+ "learning_rate": 7.774790660436857e-06,
1610
+ "loss": 0.0063,
1611
+ "step": 222
1612
+ },
1613
+ {
1614
+ "epoch": 1.7559055118110236,
1615
+ "grad_norm": 0.8483791351318359,
1616
+ "learning_rate": 7.69231545959223e-06,
1617
+ "loss": 0.0027,
1618
+ "step": 223
1619
+ },
1620
+ {
1621
+ "epoch": 1.763779527559055,
1622
+ "grad_norm": 0.16536197066307068,
1623
+ "learning_rate": 7.610005731226009e-06,
1624
+ "loss": 0.0039,
1625
+ "step": 224
1626
+ },
1627
+ {
1628
+ "epoch": 1.7716535433070866,
1629
+ "grad_norm": 0.14446765184402466,
1630
+ "learning_rate": 7.52786737735578e-06,
1631
+ "loss": 0.0036,
1632
+ "step": 225
1633
+ },
1634
+ {
1635
+ "epoch": 1.779527559055118,
1636
+ "grad_norm": 0.8880365490913391,
1637
+ "learning_rate": 7.445906287710733e-06,
1638
+ "loss": 0.0061,
1639
+ "step": 226
1640
+ },
1641
+ {
1642
+ "epoch": 1.7874015748031495,
1643
+ "grad_norm": 0.151743084192276,
1644
+ "learning_rate": 7.364128339309326e-06,
1645
+ "loss": 0.0028,
1646
+ "step": 227
1647
+ },
1648
+ {
1649
+ "epoch": 1.795275590551181,
1650
+ "grad_norm": 0.1224551647901535,
1651
+ "learning_rate": 7.282539396037868e-06,
1652
+ "loss": 0.002,
1653
+ "step": 228
1654
+ },
1655
+ {
1656
+ "epoch": 1.8031496062992125,
1657
+ "grad_norm": 0.4868486225605011,
1658
+ "learning_rate": 7.201145308230075e-06,
1659
+ "loss": 0.0031,
1660
+ "step": 229
1661
+ },
1662
+ {
1663
+ "epoch": 1.811023622047244,
1664
+ "grad_norm": 0.2875569462776184,
1665
+ "learning_rate": 7.119951912247545e-06,
1666
+ "loss": 0.0082,
1667
+ "step": 230
1668
+ },
1669
+ {
1670
+ "epoch": 1.8188976377952755,
1671
+ "grad_norm": 0.43524420261383057,
1672
+ "learning_rate": 7.038965030061273e-06,
1673
+ "loss": 0.0075,
1674
+ "step": 231
1675
+ },
1676
+ {
1677
+ "epoch": 1.826771653543307,
1678
+ "grad_norm": 0.39634883403778076,
1679
+ "learning_rate": 6.9581904688341854e-06,
1680
+ "loss": 0.0032,
1681
+ "step": 232
1682
+ },
1683
+ {
1684
+ "epoch": 1.8346456692913384,
1685
+ "grad_norm": 0.9809433817863464,
1686
+ "learning_rate": 6.8776340205047446e-06,
1687
+ "loss": 0.0085,
1688
+ "step": 233
1689
+ },
1690
+ {
1691
+ "epoch": 1.84251968503937,
1692
+ "grad_norm": 0.20062875747680664,
1693
+ "learning_rate": 6.797301461371626e-06,
1694
+ "loss": 0.0043,
1695
+ "step": 234
1696
+ },
1697
+ {
1698
+ "epoch": 1.8503937007874016,
1699
+ "grad_norm": 0.148948073387146,
1700
+ "learning_rate": 6.7171985516795315e-06,
1701
+ "loss": 0.0036,
1702
+ "step": 235
1703
+ },
1704
+ {
1705
+ "epoch": 1.858267716535433,
1706
+ "grad_norm": 0.15658679604530334,
1707
+ "learning_rate": 6.637331035206166e-06,
1708
+ "loss": 0.0046,
1709
+ "step": 236
1710
+ },
1711
+ {
1712
+ "epoch": 1.8661417322834646,
1713
+ "grad_norm": 0.22365815937519073,
1714
+ "learning_rate": 6.557704638850352e-06,
1715
+ "loss": 0.0081,
1716
+ "step": 237
1717
+ },
1718
+ {
1719
+ "epoch": 1.874015748031496,
1720
+ "grad_norm": 0.10596666485071182,
1721
+ "learning_rate": 6.4783250722214066e-06,
1722
+ "loss": 0.0032,
1723
+ "step": 238
1724
+ },
1725
+ {
1726
+ "epoch": 1.8818897637795275,
1727
+ "grad_norm": 0.2130754142999649,
1728
+ "learning_rate": 6.399198027229732e-06,
1729
+ "loss": 0.0056,
1730
+ "step": 239
1731
+ },
1732
+ {
1733
+ "epoch": 1.889763779527559,
1734
+ "grad_norm": 0.05641167238354683,
1735
+ "learning_rate": 6.320329177678656e-06,
1736
+ "loss": 0.0008,
1737
+ "step": 240
1738
+ },
1739
+ {
1740
+ "epoch": 1.8976377952755905,
1741
+ "grad_norm": 0.10349344462156296,
1742
+ "learning_rate": 6.241724178857621e-06,
1743
+ "loss": 0.0026,
1744
+ "step": 241
1745
+ },
1746
+ {
1747
+ "epoch": 1.905511811023622,
1748
+ "grad_norm": 0.08451675623655319,
1749
+ "learning_rate": 6.163388667136646e-06,
1750
+ "loss": 0.0016,
1751
+ "step": 242
1752
+ },
1753
+ {
1754
+ "epoch": 1.9133858267716537,
1755
+ "grad_norm": 0.13671623170375824,
1756
+ "learning_rate": 6.085328259562195e-06,
1757
+ "loss": 0.0034,
1758
+ "step": 243
1759
+ },
1760
+ {
1761
+ "epoch": 1.9212598425196852,
1762
+ "grad_norm": 0.5500523447990417,
1763
+ "learning_rate": 6.007548553454379e-06,
1764
+ "loss": 0.0028,
1765
+ "step": 244
1766
+ },
1767
+ {
1768
+ "epoch": 1.9291338582677167,
1769
+ "grad_norm": 0.06702329218387604,
1770
+ "learning_rate": 5.93005512600563e-06,
1771
+ "loss": 0.0009,
1772
+ "step": 245
1773
+ },
1774
+ {
1775
+ "epoch": 1.9370078740157481,
1776
+ "grad_norm": 0.15156973898410797,
1777
+ "learning_rate": 5.852853533880768e-06,
1778
+ "loss": 0.0064,
1779
+ "step": 246
1780
+ },
1781
+ {
1782
+ "epoch": 1.9448818897637796,
1783
+ "grad_norm": 0.2970314621925354,
1784
+ "learning_rate": 5.7759493128185584e-06,
1785
+ "loss": 0.0077,
1786
+ "step": 247
1787
+ },
1788
+ {
1789
+ "epoch": 1.952755905511811,
1790
+ "grad_norm": 0.06406261771917343,
1791
+ "learning_rate": 5.699347977234799e-06,
1792
+ "loss": 0.0006,
1793
+ "step": 248
1794
+ },
1795
+ {
1796
+ "epoch": 1.9606299212598426,
1797
+ "grad_norm": 0.2910393178462982,
1798
+ "learning_rate": 5.623055019826862e-06,
1799
+ "loss": 0.0036,
1800
+ "step": 249
1801
+ },
1802
+ {
1803
+ "epoch": 1.968503937007874,
1804
+ "grad_norm": 0.6454993486404419,
1805
+ "learning_rate": 5.547075911179902e-06,
1806
+ "loss": 0.0084,
1807
+ "step": 250
1808
+ },
1809
+ {
1810
+ "epoch": 1.9763779527559056,
1811
+ "grad_norm": 0.09460143744945526,
1812
+ "learning_rate": 5.471416099374525e-06,
1813
+ "loss": 0.0021,
1814
+ "step": 251
1815
+ },
1816
+ {
1817
+ "epoch": 1.984251968503937,
1818
+ "grad_norm": 0.2024363875389099,
1819
+ "learning_rate": 5.3960810095961705e-06,
1820
+ "loss": 0.0052,
1821
+ "step": 252
1822
+ },
1823
+ {
1824
+ "epoch": 1.9921259842519685,
1825
+ "grad_norm": 0.09423142671585083,
1826
+ "learning_rate": 5.321076043746108e-06,
1827
+ "loss": 0.0018,
1828
+ "step": 253
1829
+ },
1830
+ {
1831
+ "epoch": 2.0,
1832
+ "grad_norm": 0.1085880920290947,
1833
+ "learning_rate": 5.246406580054051e-06,
1834
+ "loss": 0.0039,
1835
+ "step": 254
1836
+ },
1837
+ {
1838
+ "epoch": 2.0078740157480315,
1839
+ "grad_norm": 0.20550444722175598,
1840
+ "learning_rate": 5.172077972692553e-06,
1841
+ "loss": 0.0006,
1842
+ "step": 255
1843
+ },
1844
+ {
1845
+ "epoch": 2.015748031496063,
1846
+ "grad_norm": 0.0635254830121994,
1847
+ "learning_rate": 5.098095551393066e-06,
1848
+ "loss": 0.0008,
1849
+ "step": 256
1850
+ },
1851
+ {
1852
+ "epoch": 2.0236220472440944,
1853
+ "grad_norm": 0.12593789398670197,
1854
+ "learning_rate": 5.024464621063773e-06,
1855
+ "loss": 0.0016,
1856
+ "step": 257
1857
+ },
1858
+ {
1859
+ "epoch": 2.031496062992126,
1860
+ "grad_norm": 0.08928010612726212,
1861
+ "learning_rate": 4.951190461409214e-06,
1862
+ "loss": 0.0019,
1863
+ "step": 258
1864
+ },
1865
+ {
1866
+ "epoch": 2.031496062992126,
1867
+ "eval_loss": 0.2930968105792999,
1868
+ "eval_runtime": 7.0864,
1869
+ "eval_samples_per_second": 23.002,
1870
+ "eval_steps_per_second": 2.963,
1871
+ "step": 258
1872
+ },
1873
+ {
1874
+ "epoch": 2.0393700787401574,
1875
+ "grad_norm": 0.11555846035480499,
1876
+ "learning_rate": 4.878278326551682e-06,
1877
+ "loss": 0.0036,
1878
+ "step": 259
1879
+ },
1880
+ {
1881
+ "epoch": 2.047244094488189,
1882
+ "grad_norm": 0.11923055350780487,
1883
+ "learning_rate": 4.805733444654496e-06,
1884
+ "loss": 0.0011,
1885
+ "step": 260
1886
+ },
1887
+ {
1888
+ "epoch": 2.0551181102362204,
1889
+ "grad_norm": 0.5410908460617065,
1890
+ "learning_rate": 4.733561017547104e-06,
1891
+ "loss": 0.0065,
1892
+ "step": 261
1893
+ },
1894
+ {
1895
+ "epoch": 2.062992125984252,
1896
+ "grad_norm": 0.43598446249961853,
1897
+ "learning_rate": 4.661766220352098e-06,
1898
+ "loss": 0.004,
1899
+ "step": 262
1900
+ },
1901
+ {
1902
+ "epoch": 2.0708661417322833,
1903
+ "grad_norm": 0.08221737295389175,
1904
+ "learning_rate": 4.590354201114103e-06,
1905
+ "loss": 0.0018,
1906
+ "step": 263
1907
+ },
1908
+ {
1909
+ "epoch": 2.078740157480315,
1910
+ "grad_norm": 0.07835202664136887,
1911
+ "learning_rate": 4.519330080430687e-06,
1912
+ "loss": 0.0011,
1913
+ "step": 264
1914
+ },
1915
+ {
1916
+ "epoch": 2.0866141732283463,
1917
+ "grad_norm": 0.1391119360923767,
1918
+ "learning_rate": 4.448698951085143e-06,
1919
+ "loss": 0.0018,
1920
+ "step": 265
1921
+ },
1922
+ {
1923
+ "epoch": 2.094488188976378,
1924
+ "grad_norm": 0.10286661982536316,
1925
+ "learning_rate": 4.378465877681317e-06,
1926
+ "loss": 0.0021,
1927
+ "step": 266
1928
+ },
1929
+ {
1930
+ "epoch": 2.1023622047244093,
1931
+ "grad_norm": 0.16050903499126434,
1932
+ "learning_rate": 4.3086358962804885e-06,
1933
+ "loss": 0.004,
1934
+ "step": 267
1935
+ },
1936
+ {
1937
+ "epoch": 2.1102362204724407,
1938
+ "grad_norm": 0.1615462303161621,
1939
+ "learning_rate": 4.2392140140401996e-06,
1940
+ "loss": 0.0049,
1941
+ "step": 268
1942
+ },
1943
+ {
1944
+ "epoch": 2.1181102362204722,
1945
+ "grad_norm": 0.12022113800048828,
1946
+ "learning_rate": 4.170205208855281e-06,
1947
+ "loss": 0.0021,
1948
+ "step": 269
1949
+ },
1950
+ {
1951
+ "epoch": 2.1259842519685037,
1952
+ "grad_norm": 0.18673180043697357,
1953
+ "learning_rate": 4.101614429000857e-06,
1954
+ "loss": 0.0026,
1955
+ "step": 270
1956
+ },
1957
+ {
1958
+ "epoch": 2.1338582677165356,
1959
+ "grad_norm": 0.13400611281394958,
1960
+ "learning_rate": 4.033446592777558e-06,
1961
+ "loss": 0.0045,
1962
+ "step": 271
1963
+ },
1964
+ {
1965
+ "epoch": 2.141732283464567,
1966
+ "grad_norm": 0.08963260799646378,
1967
+ "learning_rate": 3.965706588158865e-06,
1968
+ "loss": 0.002,
1969
+ "step": 272
1970
+ },
1971
+ {
1972
+ "epoch": 2.1496062992125986,
1973
+ "grad_norm": 0.07362519204616547,
1974
+ "learning_rate": 3.89839927244058e-06,
1975
+ "loss": 0.0008,
1976
+ "step": 273
1977
+ },
1978
+ {
1979
+ "epoch": 2.15748031496063,
1980
+ "grad_norm": 0.12438540160655975,
1981
+ "learning_rate": 3.8315294718925656e-06,
1982
+ "loss": 0.0032,
1983
+ "step": 274
1984
+ },
1985
+ {
1986
+ "epoch": 2.1653543307086616,
1987
+ "grad_norm": 0.07505560666322708,
1988
+ "learning_rate": 3.7651019814126656e-06,
1989
+ "loss": 0.0011,
1990
+ "step": 275
1991
+ },
1992
+ {
1993
+ "epoch": 2.173228346456693,
1994
+ "grad_norm": 0.24100656807422638,
1995
+ "learning_rate": 3.6991215641828903e-06,
1996
+ "loss": 0.0039,
1997
+ "step": 276
1998
+ },
1999
+ {
2000
+ "epoch": 2.1811023622047245,
2001
+ "grad_norm": 0.08774268627166748,
2002
+ "learning_rate": 3.6335929513278667e-06,
2003
+ "loss": 0.0021,
2004
+ "step": 277
2005
+ },
2006
+ {
2007
+ "epoch": 2.188976377952756,
2008
+ "grad_norm": 0.06761056184768677,
2009
+ "learning_rate": 3.568520841575601e-06,
2010
+ "loss": 0.0004,
2011
+ "step": 278
2012
+ },
2013
+ {
2014
+ "epoch": 2.1968503937007875,
2015
+ "grad_norm": 0.514453113079071,
2016
+ "learning_rate": 3.5039099009205503e-06,
2017
+ "loss": 0.002,
2018
+ "step": 279
2019
+ },
2020
+ {
2021
+ "epoch": 2.204724409448819,
2022
+ "grad_norm": 0.1681102067232132,
2023
+ "learning_rate": 3.439764762289051e-06,
2024
+ "loss": 0.0049,
2025
+ "step": 280
2026
+ },
2027
+ {
2028
+ "epoch": 2.2125984251968505,
2029
+ "grad_norm": 0.46447646617889404,
2030
+ "learning_rate": 3.376090025207115e-06,
2031
+ "loss": 0.0037,
2032
+ "step": 281
2033
+ },
2034
+ {
2035
+ "epoch": 2.220472440944882,
2036
+ "grad_norm": 0.09738212823867798,
2037
+ "learning_rate": 3.312890255470609e-06,
2038
+ "loss": 0.0018,
2039
+ "step": 282
2040
+ },
2041
+ {
2042
+ "epoch": 2.2283464566929134,
2043
+ "grad_norm": 0.12760388851165771,
2044
+ "learning_rate": 3.250169984817897e-06,
2045
+ "loss": 0.0022,
2046
+ "step": 283
2047
+ },
2048
+ {
2049
+ "epoch": 2.236220472440945,
2050
+ "grad_norm": 0.05433168262243271,
2051
+ "learning_rate": 3.187933710604847e-06,
2052
+ "loss": 0.0005,
2053
+ "step": 284
2054
+ },
2055
+ {
2056
+ "epoch": 2.2440944881889764,
2057
+ "grad_norm": 0.06812359392642975,
2058
+ "learning_rate": 3.1261858954823798e-06,
2059
+ "loss": 0.0007,
2060
+ "step": 285
2061
+ },
2062
+ {
2063
+ "epoch": 2.251968503937008,
2064
+ "grad_norm": 0.44168326258659363,
2065
+ "learning_rate": 3.064930967076477e-06,
2066
+ "loss": 0.0052,
2067
+ "step": 286
2068
+ },
2069
+ {
2070
+ "epoch": 2.2598425196850394,
2071
+ "grad_norm": 0.4508403241634369,
2072
+ "learning_rate": 3.0041733176706668e-06,
2073
+ "loss": 0.0049,
2074
+ "step": 287
2075
+ },
2076
+ {
2077
+ "epoch": 2.267716535433071,
2078
+ "grad_norm": 0.00029889008146710694,
2079
+ "learning_rate": 2.943917303891107e-06,
2080
+ "loss": 0.0,
2081
+ "step": 288
2082
+ },
2083
+ {
2084
+ "epoch": 2.2755905511811023,
2085
+ "grad_norm": 0.16293245553970337,
2086
+ "learning_rate": 2.8841672463941827e-06,
2087
+ "loss": 0.0052,
2088
+ "step": 289
2089
+ },
2090
+ {
2091
+ "epoch": 2.283464566929134,
2092
+ "grad_norm": 0.0034355763345956802,
2093
+ "learning_rate": 2.8249274295566863e-06,
2094
+ "loss": 0.0,
2095
+ "step": 290
2096
+ },
2097
+ {
2098
+ "epoch": 2.2913385826771653,
2099
+ "grad_norm": 0.41321080923080444,
2100
+ "learning_rate": 2.766202101168628e-06,
2101
+ "loss": 0.0042,
2102
+ "step": 291
2103
+ },
2104
+ {
2105
+ "epoch": 2.2992125984251968,
2106
+ "grad_norm": 0.05302264913916588,
2107
+ "learning_rate": 2.7079954721286108e-06,
2108
+ "loss": 0.0008,
2109
+ "step": 292
2110
+ },
2111
+ {
2112
+ "epoch": 2.3070866141732282,
2113
+ "grad_norm": 0.16997075080871582,
2114
+ "learning_rate": 2.6503117161419246e-06,
2115
+ "loss": 0.0049,
2116
+ "step": 293
2117
+ },
2118
+ {
2119
+ "epoch": 2.3149606299212597,
2120
+ "grad_norm": 0.15489016473293304,
2121
+ "learning_rate": 2.5931549694212545e-06,
2122
+ "loss": 0.0029,
2123
+ "step": 294
2124
+ },
2125
+ {
2126
+ "epoch": 2.322834645669291,
2127
+ "grad_norm": 0.040922824293375015,
2128
+ "learning_rate": 2.536529330390095e-06,
2129
+ "loss": 0.0003,
2130
+ "step": 295
2131
+ },
2132
+ {
2133
+ "epoch": 2.3307086614173227,
2134
+ "grad_norm": 0.15096415579319,
2135
+ "learning_rate": 2.480438859388873e-06,
2136
+ "loss": 0.0037,
2137
+ "step": 296
2138
+ },
2139
+ {
2140
+ "epoch": 2.338582677165354,
2141
+ "grad_norm": 0.05358278378844261,
2142
+ "learning_rate": 2.424887578383799e-06,
2143
+ "loss": 0.0004,
2144
+ "step": 297
2145
+ },
2146
+ {
2147
+ "epoch": 2.3464566929133857,
2148
+ "grad_norm": 0.16193096339702606,
2149
+ "learning_rate": 2.36987947067848e-06,
2150
+ "loss": 0.0025,
2151
+ "step": 298
2152
+ },
2153
+ {
2154
+ "epoch": 2.354330708661417,
2155
+ "grad_norm": 0.10353274643421173,
2156
+ "learning_rate": 2.3154184806282863e-06,
2157
+ "loss": 0.0021,
2158
+ "step": 299
2159
+ },
2160
+ {
2161
+ "epoch": 2.362204724409449,
2162
+ "grad_norm": 0.10735179483890533,
2163
+ "learning_rate": 2.261508513357532e-06,
2164
+ "loss": 0.0035,
2165
+ "step": 300
2166
+ },
2167
+ {
2168
+ "epoch": 2.3700787401574805,
2169
+ "grad_norm": 0.18752367794513702,
2170
+ "learning_rate": 2.208153434479442e-06,
2171
+ "loss": 0.0039,
2172
+ "step": 301
2173
+ },
2174
+ {
2175
+ "epoch": 2.3700787401574805,
2176
+ "eval_loss": 0.30320534110069275,
2177
+ "eval_runtime": 6.5784,
2178
+ "eval_samples_per_second": 24.778,
2179
+ "eval_steps_per_second": 3.192,
2180
+ "step": 301
2181
+ },
2182
+ {
2183
+ "epoch": 2.377952755905512,
2184
+ "grad_norm": 0.13881297409534454,
2185
+ "learning_rate": 2.155357069818995e-06,
2186
+ "loss": 0.0032,
2187
+ "step": 302
2188
+ },
2189
+ {
2190
+ "epoch": 2.3858267716535435,
2191
+ "grad_norm": 0.09920285642147064,
2192
+ "learning_rate": 2.1031232051385606e-06,
2193
+ "loss": 0.0021,
2194
+ "step": 303
2195
+ },
2196
+ {
2197
+ "epoch": 2.393700787401575,
2198
+ "grad_norm": 0.37194201350212097,
2199
+ "learning_rate": 2.0514555858664663e-06,
2200
+ "loss": 0.0045,
2201
+ "step": 304
2202
+ },
2203
+ {
2204
+ "epoch": 2.4015748031496065,
2205
+ "grad_norm": 0.10560385882854462,
2206
+ "learning_rate": 2.000357916828428e-06,
2207
+ "loss": 0.0011,
2208
+ "step": 305
2209
+ },
2210
+ {
2211
+ "epoch": 2.409448818897638,
2212
+ "grad_norm": 0.33549824357032776,
2213
+ "learning_rate": 1.949833861981877e-06,
2214
+ "loss": 0.0039,
2215
+ "step": 306
2216
+ },
2217
+ {
2218
+ "epoch": 2.4173228346456694,
2219
+ "grad_norm": 0.3969619870185852,
2220
+ "learning_rate": 1.8998870441532569e-06,
2221
+ "loss": 0.0027,
2222
+ "step": 307
2223
+ },
2224
+ {
2225
+ "epoch": 2.425196850393701,
2226
+ "grad_norm": 0.081158846616745,
2227
+ "learning_rate": 1.8505210447782418e-06,
2228
+ "loss": 0.0011,
2229
+ "step": 308
2230
+ },
2231
+ {
2232
+ "epoch": 2.4330708661417324,
2233
+ "grad_norm": 0.28652095794677734,
2234
+ "learning_rate": 1.8017394036449276e-06,
2235
+ "loss": 0.0038,
2236
+ "step": 309
2237
+ },
2238
+ {
2239
+ "epoch": 2.440944881889764,
2240
+ "grad_norm": 0.0656951516866684,
2241
+ "learning_rate": 1.7535456186400123e-06,
2242
+ "loss": 0.001,
2243
+ "step": 310
2244
+ },
2245
+ {
2246
+ "epoch": 2.4488188976377954,
2247
+ "grad_norm": 0.14871421456336975,
2248
+ "learning_rate": 1.7059431454979825e-06,
2249
+ "loss": 0.0027,
2250
+ "step": 311
2251
+ },
2252
+ {
2253
+ "epoch": 2.456692913385827,
2254
+ "grad_norm": 0.25429457426071167,
2255
+ "learning_rate": 1.6589353975533174e-06,
2256
+ "loss": 0.0012,
2257
+ "step": 312
2258
+ },
2259
+ {
2260
+ "epoch": 2.4645669291338583,
2261
+ "grad_norm": 0.06939385086297989,
2262
+ "learning_rate": 1.6125257454957365e-06,
2263
+ "loss": 0.0008,
2264
+ "step": 313
2265
+ },
2266
+ {
2267
+ "epoch": 2.47244094488189,
2268
+ "grad_norm": 0.15781065821647644,
2269
+ "learning_rate": 1.5667175171285054e-06,
2270
+ "loss": 0.003,
2271
+ "step": 314
2272
+ },
2273
+ {
2274
+ "epoch": 2.4803149606299213,
2275
+ "grad_norm": 0.08229056000709534,
2276
+ "learning_rate": 1.5215139971298131e-06,
2277
+ "loss": 0.0015,
2278
+ "step": 315
2279
+ },
2280
+ {
2281
+ "epoch": 2.4881889763779528,
2282
+ "grad_norm": 0.16827985644340515,
2283
+ "learning_rate": 1.4769184268172465e-06,
2284
+ "loss": 0.0032,
2285
+ "step": 316
2286
+ },
2287
+ {
2288
+ "epoch": 2.4960629921259843,
2289
+ "grad_norm": 0.12261717021465302,
2290
+ "learning_rate": 1.4329340039153738e-06,
2291
+ "loss": 0.0022,
2292
+ "step": 317
2293
+ },
2294
+ {
2295
+ "epoch": 2.5039370078740157,
2296
+ "grad_norm": 0.1208304911851883,
2297
+ "learning_rate": 1.3895638823264447e-06,
2298
+ "loss": 0.002,
2299
+ "step": 318
2300
+ },
2301
+ {
2302
+ "epoch": 2.5118110236220472,
2303
+ "grad_norm": 0.22991932928562164,
2304
+ "learning_rate": 1.3468111719042497e-06,
2305
+ "loss": 0.0027,
2306
+ "step": 319
2307
+ },
2308
+ {
2309
+ "epoch": 2.5196850393700787,
2310
+ "grad_norm": 0.468462198972702,
2311
+ "learning_rate": 1.3046789382311132e-06,
2312
+ "loss": 0.0042,
2313
+ "step": 320
2314
+ },
2315
+ {
2316
+ "epoch": 2.52755905511811,
2317
+ "grad_norm": 0.029908303171396255,
2318
+ "learning_rate": 1.2631702023980997e-06,
2319
+ "loss": 0.0002,
2320
+ "step": 321
2321
+ },
2322
+ {
2323
+ "epoch": 2.5354330708661417,
2324
+ "grad_norm": 0.07678980380296707,
2325
+ "learning_rate": 1.2222879407883592e-06,
2326
+ "loss": 0.0014,
2327
+ "step": 322
2328
+ },
2329
+ {
2330
+ "epoch": 2.543307086614173,
2331
+ "grad_norm": 0.13547496497631073,
2332
+ "learning_rate": 1.182035084863724e-06,
2333
+ "loss": 0.0017,
2334
+ "step": 323
2335
+ },
2336
+ {
2337
+ "epoch": 2.5511811023622046,
2338
+ "grad_norm": 0.15075382590293884,
2339
+ "learning_rate": 1.1424145209545079e-06,
2340
+ "loss": 0.0059,
2341
+ "step": 324
2342
+ },
2343
+ {
2344
+ "epoch": 2.559055118110236,
2345
+ "grad_norm": 0.1271948516368866,
2346
+ "learning_rate": 1.1034290900525279e-06,
2347
+ "loss": 0.0021,
2348
+ "step": 325
2349
+ },
2350
+ {
2351
+ "epoch": 2.5669291338582676,
2352
+ "grad_norm": 0.11441997438669205,
2353
+ "learning_rate": 1.065081587607406e-06,
2354
+ "loss": 0.0022,
2355
+ "step": 326
2356
+ },
2357
+ {
2358
+ "epoch": 2.574803149606299,
2359
+ "grad_norm": 0.13326182961463928,
2360
+ "learning_rate": 1.0273747633261144e-06,
2361
+ "loss": 0.004,
2362
+ "step": 327
2363
+ },
2364
+ {
2365
+ "epoch": 2.5826771653543306,
2366
+ "grad_norm": 0.07804345339536667,
2367
+ "learning_rate": 9.903113209758098e-07,
2368
+ "loss": 0.0018,
2369
+ "step": 328
2370
+ },
2371
+ {
2372
+ "epoch": 2.590551181102362,
2373
+ "grad_norm": 0.0012728713918477297,
2374
+ "learning_rate": 9.538939181899565e-07,
2375
+ "loss": 0.0,
2376
+ "step": 329
2377
+ },
2378
+ {
2379
+ "epoch": 2.5984251968503935,
2380
+ "grad_norm": 0.06427028775215149,
2381
+ "learning_rate": 9.181251662777668e-07,
2382
+ "loss": 0.0007,
2383
+ "step": 330
2384
+ },
2385
+ {
2386
+ "epoch": 2.606299212598425,
2387
+ "grad_norm": 0.1923428475856781,
2388
+ "learning_rate": 8.830076300369517e-07,
2389
+ "loss": 0.006,
2390
+ "step": 331
2391
+ },
2392
+ {
2393
+ "epoch": 2.6141732283464565,
2394
+ "grad_norm": 0.33056169748306274,
2395
+ "learning_rate": 8.485438275698154e-07,
2396
+ "loss": 0.0024,
2397
+ "step": 332
2398
+ },
2399
+ {
2400
+ "epoch": 2.622047244094488,
2401
+ "grad_norm": 0.13692541420459747,
2402
+ "learning_rate": 8.14736230102694e-07,
2403
+ "loss": 0.0019,
2404
+ "step": 333
2405
+ },
2406
+ {
2407
+ "epoch": 2.6299212598425195,
2408
+ "grad_norm": 0.11543405055999756,
2409
+ "learning_rate": 7.815872618087506e-07,
2410
+ "loss": 0.003,
2411
+ "step": 334
2412
+ },
2413
+ {
2414
+ "epoch": 2.637795275590551,
2415
+ "grad_norm": 0.20871274173259735,
2416
+ "learning_rate": 7.490992996341662e-07,
2417
+ "loss": 0.0022,
2418
+ "step": 335
2419
+ },
2420
+ {
2421
+ "epoch": 2.6456692913385824,
2422
+ "grad_norm": 0.1506434828042984,
2423
+ "learning_rate": 7.17274673127677e-07,
2424
+ "loss": 0.0034,
2425
+ "step": 336
2426
+ },
2427
+ {
2428
+ "epoch": 2.653543307086614,
2429
+ "grad_norm": 0.1000061109662056,
2430
+ "learning_rate": 6.861156642735578e-07,
2431
+ "loss": 0.0015,
2432
+ "step": 337
2433
+ },
2434
+ {
2435
+ "epoch": 2.661417322834646,
2436
+ "grad_norm": 0.04730301722884178,
2437
+ "learning_rate": 6.556245073279777e-07,
2438
+ "loss": 0.0003,
2439
+ "step": 338
2440
+ },
2441
+ {
2442
+ "epoch": 2.6692913385826773,
2443
+ "grad_norm": 0.07712409645318985,
2444
+ "learning_rate": 6.258033886587911e-07,
2445
+ "loss": 0.0006,
2446
+ "step": 339
2447
+ },
2448
+ {
2449
+ "epoch": 2.677165354330709,
2450
+ "grad_norm": 0.12951001524925232,
2451
+ "learning_rate": 5.966544465887803e-07,
2452
+ "loss": 0.0022,
2453
+ "step": 340
2454
+ },
2455
+ {
2456
+ "epoch": 2.6850393700787403,
2457
+ "grad_norm": 0.3450707495212555,
2458
+ "learning_rate": 5.681797712423099e-07,
2459
+ "loss": 0.0031,
2460
+ "step": 341
2461
+ },
2462
+ {
2463
+ "epoch": 2.6929133858267718,
2464
+ "grad_norm": 0.11356323957443237,
2465
+ "learning_rate": 5.403814043954592e-07,
2466
+ "loss": 0.0016,
2467
+ "step": 342
2468
+ },
2469
+ {
2470
+ "epoch": 2.7007874015748032,
2471
+ "grad_norm": 0.40962764620780945,
2472
+ "learning_rate": 5.132613393296293e-07,
2473
+ "loss": 0.0022,
2474
+ "step": 343
2475
+ },
2476
+ {
2477
+ "epoch": 2.7086614173228347,
2478
+ "grad_norm": 0.0026160525158047676,
2479
+ "learning_rate": 4.868215206885918e-07,
2480
+ "loss": 0.0,
2481
+ "step": 344
2482
+ },
2483
+ {
2484
+ "epoch": 2.7086614173228347,
2485
+ "eval_loss": 0.3102666437625885,
2486
+ "eval_runtime": 7.3029,
2487
+ "eval_samples_per_second": 22.32,
2488
+ "eval_steps_per_second": 2.876,
2489
+ "step": 344
2490
+ },
2491
+ {
2492
+ "epoch": 2.716535433070866,
2493
+ "grad_norm": 0.2460733950138092,
2494
+ "learning_rate": 4.61063844339068e-07,
2495
+ "loss": 0.0044,
2496
+ "step": 345
2497
+ },
2498
+ {
2499
+ "epoch": 2.7244094488188977,
2500
+ "grad_norm": 0.11104279011487961,
2501
+ "learning_rate": 4.359901572347758e-07,
2502
+ "loss": 0.0031,
2503
+ "step": 346
2504
+ },
2505
+ {
2506
+ "epoch": 2.732283464566929,
2507
+ "grad_norm": 0.288809210062027,
2508
+ "learning_rate": 4.116022572839984e-07,
2509
+ "loss": 0.0023,
2510
+ "step": 347
2511
+ },
2512
+ {
2513
+ "epoch": 2.7401574803149606,
2514
+ "grad_norm": 0.2904239892959595,
2515
+ "learning_rate": 3.879018932206624e-07,
2516
+ "loss": 0.001,
2517
+ "step": 348
2518
+ },
2519
+ {
2520
+ "epoch": 2.748031496062992,
2521
+ "grad_norm": 0.5172310471534729,
2522
+ "learning_rate": 3.6489076447894456e-07,
2523
+ "loss": 0.0023,
2524
+ "step": 349
2525
+ },
2526
+ {
2527
+ "epoch": 2.7559055118110236,
2528
+ "grad_norm": 0.555241048336029,
2529
+ "learning_rate": 3.425705210714192e-07,
2530
+ "loss": 0.0026,
2531
+ "step": 350
2532
+ },
2533
+ {
2534
+ "epoch": 2.763779527559055,
2535
+ "grad_norm": 0.12381427735090256,
2536
+ "learning_rate": 3.2094276347073626e-07,
2537
+ "loss": 0.002,
2538
+ "step": 351
2539
+ },
2540
+ {
2541
+ "epoch": 2.7716535433070866,
2542
+ "grad_norm": 0.16744810342788696,
2543
+ "learning_rate": 3.000090424948665e-07,
2544
+ "loss": 0.0036,
2545
+ "step": 352
2546
+ },
2547
+ {
2548
+ "epoch": 2.779527559055118,
2549
+ "grad_norm": 0.512416422367096,
2550
+ "learning_rate": 2.7977085919589253e-07,
2551
+ "loss": 0.0026,
2552
+ "step": 353
2553
+ },
2554
+ {
2555
+ "epoch": 2.7874015748031495,
2556
+ "grad_norm": 0.18864978849887848,
2557
+ "learning_rate": 2.602296647523861e-07,
2558
+ "loss": 0.0025,
2559
+ "step": 354
2560
+ },
2561
+ {
2562
+ "epoch": 2.795275590551181,
2563
+ "grad_norm": 0.012189923785626888,
2564
+ "learning_rate": 2.413868603653413e-07,
2565
+ "loss": 0.0001,
2566
+ "step": 355
2567
+ },
2568
+ {
2569
+ "epoch": 2.8031496062992125,
2570
+ "grad_norm": 0.10027037560939789,
2571
+ "learning_rate": 2.2324379715770728e-07,
2572
+ "loss": 0.0011,
2573
+ "step": 356
2574
+ },
2575
+ {
2576
+ "epoch": 2.811023622047244,
2577
+ "grad_norm": 0.10117685794830322,
2578
+ "learning_rate": 2.0580177607750663e-07,
2579
+ "loss": 0.0036,
2580
+ "step": 357
2581
+ },
2582
+ {
2583
+ "epoch": 2.8188976377952755,
2584
+ "grad_norm": 0.1535252332687378,
2585
+ "learning_rate": 1.890620478045435e-07,
2586
+ "loss": 0.0044,
2587
+ "step": 358
2588
+ },
2589
+ {
2590
+ "epoch": 2.826771653543307,
2591
+ "grad_norm": 0.39140409231185913,
2592
+ "learning_rate": 1.7302581266073537e-07,
2593
+ "loss": 0.0037,
2594
+ "step": 359
2595
+ },
2596
+ {
2597
+ "epoch": 2.8346456692913384,
2598
+ "grad_norm": 0.18143348395824432,
2599
+ "learning_rate": 1.5769422052403172e-07,
2600
+ "loss": 0.0033,
2601
+ "step": 360
2602
+ },
2603
+ {
2604
+ "epoch": 2.84251968503937,
2605
+ "grad_norm": 0.6282801032066345,
2606
+ "learning_rate": 1.4306837074597235e-07,
2607
+ "loss": 0.0096,
2608
+ "step": 361
2609
+ },
2610
+ {
2611
+ "epoch": 2.850393700787402,
2612
+ "grad_norm": 0.3672868311405182,
2613
+ "learning_rate": 1.2914931207285154e-07,
2614
+ "loss": 0.0014,
2615
+ "step": 362
2616
+ },
2617
+ {
2618
+ "epoch": 2.8582677165354333,
2619
+ "grad_norm": 0.13403712213039398,
2620
+ "learning_rate": 1.1593804257052143e-07,
2621
+ "loss": 0.0046,
2622
+ "step": 363
2623
+ },
2624
+ {
2625
+ "epoch": 2.866141732283465,
2626
+ "grad_norm": 0.004047819878906012,
2627
+ "learning_rate": 1.0343550955282278e-07,
2628
+ "loss": 0.0,
2629
+ "step": 364
2630
+ },
2631
+ {
2632
+ "epoch": 2.8740157480314963,
2633
+ "grad_norm": 0.3351942002773285,
2634
+ "learning_rate": 9.164260951366021e-08,
2635
+ "loss": 0.0024,
2636
+ "step": 365
2637
+ },
2638
+ {
2639
+ "epoch": 2.8818897637795278,
2640
+ "grad_norm": 0.09759978204965591,
2641
+ "learning_rate": 8.056018806271937e-08,
2642
+ "loss": 0.002,
2643
+ "step": 366
2644
+ },
2645
+ {
2646
+ "epoch": 2.8897637795275593,
2647
+ "grad_norm": 0.06213594600558281,
2648
+ "learning_rate": 7.018903986483083e-08,
2649
+ "loss": 0.0009,
2650
+ "step": 367
2651
+ },
2652
+ {
2653
+ "epoch": 2.8976377952755907,
2654
+ "grad_norm": 0.07074209302663803,
2655
+ "learning_rate": 6.052990858298801e-08,
2656
+ "loss": 0.0009,
2657
+ "step": 368
2658
+ },
2659
+ {
2660
+ "epoch": 2.905511811023622,
2661
+ "grad_norm": 0.271335631608963,
2662
+ "learning_rate": 5.158348682502756e-08,
2663
+ "loss": 0.0037,
2664
+ "step": 369
2665
+ },
2666
+ {
2667
+ "epoch": 2.9133858267716537,
2668
+ "grad_norm": 0.09063868969678879,
2669
+ "learning_rate": 4.335041609396018e-08,
2670
+ "loss": 0.0014,
2671
+ "step": 370
2672
+ },
2673
+ {
2674
+ "epoch": 2.921259842519685,
2675
+ "grad_norm": 0.818594753742218,
2676
+ "learning_rate": 3.5831286741973006e-08,
2677
+ "loss": 0.0033,
2678
+ "step": 371
2679
+ },
2680
+ {
2681
+ "epoch": 2.9291338582677167,
2682
+ "grad_norm": 0.09543661028146744,
2683
+ "learning_rate": 2.902663792810012e-08,
2684
+ "loss": 0.0015,
2685
+ "step": 372
2686
+ },
2687
+ {
2688
+ "epoch": 2.937007874015748,
2689
+ "grad_norm": 0.13098907470703125,
2690
+ "learning_rate": 2.293695757956571e-08,
2691
+ "loss": 0.0037,
2692
+ "step": 373
2693
+ },
2694
+ {
2695
+ "epoch": 2.9448818897637796,
2696
+ "grad_norm": 0.5491423010826111,
2697
+ "learning_rate": 1.7562682356786488e-08,
2698
+ "loss": 0.004,
2699
+ "step": 374
2700
+ },
2701
+ {
2702
+ "epoch": 2.952755905511811,
2703
+ "grad_norm": 0.08357255905866623,
2704
+ "learning_rate": 1.290419762207007e-08,
2705
+ "loss": 0.0015,
2706
+ "step": 375
2707
+ },
2708
+ {
2709
+ "epoch": 2.9606299212598426,
2710
+ "grad_norm": 0.24269114434719086,
2711
+ "learning_rate": 8.961837411982643e-09,
2712
+ "loss": 0.0028,
2713
+ "step": 376
2714
+ },
2715
+ {
2716
+ "epoch": 2.968503937007874,
2717
+ "grad_norm": 0.1084604412317276,
2718
+ "learning_rate": 5.735884413391457e-09,
2719
+ "loss": 0.0022,
2720
+ "step": 377
2721
+ },
2722
+ {
2723
+ "epoch": 2.9763779527559056,
2724
+ "grad_norm": 0.09172981232404709,
2725
+ "learning_rate": 3.226569943197699e-09,
2726
+ "loss": 0.0022,
2727
+ "step": 378
2728
+ },
2729
+ {
2730
+ "epoch": 2.984251968503937,
2731
+ "grad_norm": 0.1312946230173111,
2732
+ "learning_rate": 1.4340739317497688e-09,
2733
+ "loss": 0.002,
2734
+ "step": 379
2735
+ },
2736
+ {
2737
+ "epoch": 2.9921259842519685,
2738
+ "grad_norm": 0.0002817026397679001,
2739
+ "learning_rate": 3.585249099435917e-10,
2740
+ "loss": 0.0,
2741
+ "step": 380
2742
+ },
2743
+ {
2744
+ "epoch": 3.0,
2745
+ "grad_norm": 0.2318553477525711,
2746
+ "learning_rate": 0.0,
2747
+ "loss": 0.0007,
2748
+ "step": 381
2749
+ }
2750
+ ],
2751
+ "logging_steps": 1,
2752
+ "max_steps": 381,
2753
+ "num_input_tokens_seen": 0,
2754
+ "num_train_epochs": 3,
2755
+ "save_steps": 127,
2756
+ "stateful_callbacks": {
2757
+ "TrainerControl": {
2758
+ "args": {
2759
+ "should_epoch_stop": false,
2760
+ "should_evaluate": false,
2761
+ "should_log": false,
2762
+ "should_save": true,
2763
+ "should_training_stop": true
2764
+ },
2765
+ "attributes": {}
2766
+ }
2767
+ },
2768
+ "total_flos": 3.9261813209667994e+17,
2769
+ "train_batch_size": 128,
2770
+ "trial_name": null,
2771
+ "trial_params": null
2772
+ }
checkpoint-381/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:033fc2cc0303528d4e9ad523b3fd63b75e963b86dba301044379df1d98e6c394
3
+ size 10744
checkpoint-381/vocab.json ADDED
The diff for this file is too large to render. See raw diff