ticoAg commited on
Commit
0ddfe75
1 Parent(s): 1577eae

qwen2-7b-instruct-traininfer-instruct0625

Browse files
Files changed (21) hide show
  1. qwen2-7b-instruct-traininfer-instruct0625/.DS_Store +0 -0
  2. qwen2-7b-instruct-traininfer-instruct0625/ckpt/README.md +75 -0
  3. qwen2-7b-instruct-traininfer-instruct0625/ckpt/adapter_config.json +34 -0
  4. qwen2-7b-instruct-traininfer-instruct0625/ckpt/adapter_model.safetensors +3 -0
  5. qwen2-7b-instruct-traininfer-instruct0625/ckpt/added_tokens.json +5 -0
  6. qwen2-7b-instruct-traininfer-instruct0625/ckpt/all_results.json +12 -0
  7. qwen2-7b-instruct-traininfer-instruct0625/ckpt/eval_results.json +7 -0
  8. qwen2-7b-instruct-traininfer-instruct0625/ckpt/merges.txt +0 -0
  9. qwen2-7b-instruct-traininfer-instruct0625/ckpt/special_tokens_map.json +20 -0
  10. qwen2-7b-instruct-traininfer-instruct0625/ckpt/tokenizer.json +0 -0
  11. qwen2-7b-instruct-traininfer-instruct0625/ckpt/tokenizer_config.json +44 -0
  12. qwen2-7b-instruct-traininfer-instruct0625/ckpt/train_results.json +8 -0
  13. qwen2-7b-instruct-traininfer-instruct0625/ckpt/trainer_log.jsonl +198 -0
  14. qwen2-7b-instruct-traininfer-instruct0625/ckpt/trainer_state.json +1430 -0
  15. qwen2-7b-instruct-traininfer-instruct0625/ckpt/training_args.bin +3 -0
  16. qwen2-7b-instruct-traininfer-instruct0625/ckpt/training_eval_loss.png +0 -0
  17. qwen2-7b-instruct-traininfer-instruct0625/ckpt/training_loss.png +0 -0
  18. qwen2-7b-instruct-traininfer-instruct0625/ckpt/vocab.json +0 -0
  19. qwen2-7b-instruct-traininfer-instruct0625/merge_weight.py +20 -0
  20. qwen2-7b-instruct-traininfer-instruct0625/qwen2_7b_instruct_lora_sft.yaml +41 -0
  21. qwen2-7b-instruct-traininfer-instruct0625/readme.md +29 -0
qwen2-7b-instruct-traininfer-instruct0625/.DS_Store ADDED
Binary file (6.15 kB). View file
 
qwen2-7b-instruct-traininfer-instruct0625/ckpt/README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /data/modelscope/qwen/Qwen2-7B-Instruct
3
+ library_name: peft
4
+ license: other
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: sft
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # sft
18
+
19
+ This model is a fine-tuned version of [/data/modelscope/qwen/Qwen2-7B-Instruct](https://huggingface.co//data/modelscope/qwen/Qwen2-7B-Instruct) on the llm-complex-reasoning-train-qwen2-72b-instruct-correct and the Infinity-Instruct-0625 datasets.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.9800
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 5e-05
41
+ - train_batch_size: 8
42
+ - eval_batch_size: 4
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 4
46
+ - gradient_accumulation_steps: 8
47
+ - total_train_batch_size: 256
48
+ - total_eval_batch_size: 16
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: cosine
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - num_epochs: 2.0
53
+
54
+ ### Training results
55
+
56
+ | Training Loss | Epoch | Step | Validation Loss |
57
+ |:-------------:|:------:|:----:|:---------------:|
58
+ | 1.0282 | 0.2122 | 100 | 1.0028 |
59
+ | 1.0122 | 0.4244 | 200 | 0.9917 |
60
+ | 0.9884 | 0.6366 | 300 | 0.9869 |
61
+ | 0.9771 | 0.8488 | 400 | 0.9841 |
62
+ | 0.9974 | 1.0610 | 500 | 0.9823 |
63
+ | 0.9934 | 1.2732 | 600 | 0.9813 |
64
+ | 0.9738 | 1.4854 | 700 | 0.9805 |
65
+ | 0.9744 | 1.6976 | 800 | 0.9801 |
66
+ | 0.9887 | 1.9098 | 900 | 0.9800 |
67
+
68
+
69
+ ### Framework versions
70
+
71
+ - PEFT 0.12.0
72
+ - Transformers 4.43.4
73
+ - Pytorch 2.2.1
74
+ - Datasets 2.18.0
75
+ - Tokenizers 0.19.1
qwen2-7b-instruct-traininfer-instruct0625/ckpt/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data/modelscope/qwen/Qwen2-7B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "up_proj",
24
+ "gate_proj",
25
+ "down_proj",
26
+ "v_proj",
27
+ "o_proj",
28
+ "q_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
qwen2-7b-instruct-traininfer-instruct0625/ckpt/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4da96cc84dffeb7d158a55d29722c630fa44df726432634d7bc2cdcc5b317112
3
+ size 40422208
qwen2-7b-instruct-traininfer-instruct0625/ckpt/added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
qwen2-7b-instruct-traininfer-instruct0625/ckpt/all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.9989389920424403,
3
+ "eval_loss": 0.9800044894218445,
4
+ "eval_runtime": 502.5364,
5
+ "eval_samples_per_second": 26.673,
6
+ "eval_steps_per_second": 1.668,
7
+ "total_flos": 1.4259794694102843e+19,
8
+ "train_loss": 0.9894429036513003,
9
+ "train_runtime": 36869.1128,
10
+ "train_samples_per_second": 6.544,
11
+ "train_steps_per_second": 0.026
12
+ }
qwen2-7b-instruct-traininfer-instruct0625/ckpt/eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.9989389920424403,
3
+ "eval_loss": 0.9800044894218445,
4
+ "eval_runtime": 502.5364,
5
+ "eval_samples_per_second": 26.673,
6
+ "eval_steps_per_second": 1.668
7
+ }
qwen2-7b-instruct-traininfer-instruct0625/ckpt/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2-7b-instruct-traininfer-instruct0625/ckpt/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
qwen2-7b-instruct-traininfer-instruct0625/ckpt/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
qwen2-7b-instruct-traininfer-instruct0625/ckpt/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "model_max_length": 131072,
39
+ "pad_token": "<|endoftext|>",
40
+ "padding_side": "right",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null
44
+ }
qwen2-7b-instruct-traininfer-instruct0625/ckpt/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.9989389920424403,
3
+ "total_flos": 1.4259794694102843e+19,
4
+ "train_loss": 0.9894429036513003,
5
+ "train_runtime": 36869.1128,
6
+ "train_samples_per_second": 6.544,
7
+ "train_steps_per_second": 0.026
8
+ }
qwen2-7b-instruct-traininfer-instruct0625/ckpt/trainer_log.jsonl ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 5, "total_steps": 942, "loss": 1.0589, "learning_rate": 2.631578947368421e-06, "epoch": 0.010610079575596816, "percentage": 0.53, "elapsed_time": "0:02:49", "remaining_time": "8:48:02", "throughput": "0.00", "total_tokens": 0}
2
+ {"current_steps": 10, "total_steps": 942, "loss": 1.0432, "learning_rate": 5.263157894736842e-06, "epoch": 0.021220159151193633, "percentage": 1.06, "elapsed_time": "0:05:35", "remaining_time": "8:40:38", "throughput": "0.00", "total_tokens": 0}
3
+ {"current_steps": 15, "total_steps": 942, "loss": 1.0265, "learning_rate": 7.894736842105263e-06, "epoch": 0.03183023872679045, "percentage": 1.59, "elapsed_time": "0:08:29", "remaining_time": "8:44:56", "throughput": "0.00", "total_tokens": 0}
4
+ {"current_steps": 20, "total_steps": 942, "loss": 1.0666, "learning_rate": 1.0526315789473684e-05, "epoch": 0.042440318302387266, "percentage": 2.12, "elapsed_time": "0:11:14", "remaining_time": "8:38:36", "throughput": "0.00", "total_tokens": 0}
5
+ {"current_steps": 25, "total_steps": 942, "loss": 1.0659, "learning_rate": 1.3157894736842106e-05, "epoch": 0.05305039787798409, "percentage": 2.65, "elapsed_time": "0:13:59", "remaining_time": "8:33:21", "throughput": "0.00", "total_tokens": 0}
6
+ {"current_steps": 30, "total_steps": 942, "loss": 1.0403, "learning_rate": 1.5789473684210526e-05, "epoch": 0.0636604774535809, "percentage": 3.18, "elapsed_time": "0:16:48", "remaining_time": "8:31:05", "throughput": "0.00", "total_tokens": 0}
7
+ {"current_steps": 35, "total_steps": 942, "loss": 1.0297, "learning_rate": 1.8421052631578947e-05, "epoch": 0.07427055702917772, "percentage": 3.72, "elapsed_time": "0:19:36", "remaining_time": "8:28:02", "throughput": "0.00", "total_tokens": 0}
8
+ {"current_steps": 40, "total_steps": 942, "loss": 1.0295, "learning_rate": 2.105263157894737e-05, "epoch": 0.08488063660477453, "percentage": 4.25, "elapsed_time": "0:22:35", "remaining_time": "8:29:23", "throughput": "0.00", "total_tokens": 0}
9
+ {"current_steps": 45, "total_steps": 942, "loss": 1.0332, "learning_rate": 2.368421052631579e-05, "epoch": 0.09549071618037135, "percentage": 4.78, "elapsed_time": "0:25:27", "remaining_time": "8:27:26", "throughput": "0.00", "total_tokens": 0}
10
+ {"current_steps": 50, "total_steps": 942, "loss": 1.0529, "learning_rate": 2.6315789473684212e-05, "epoch": 0.10610079575596817, "percentage": 5.31, "elapsed_time": "0:28:18", "remaining_time": "8:25:09", "throughput": "0.00", "total_tokens": 0}
11
+ {"current_steps": 55, "total_steps": 942, "loss": 1.0286, "learning_rate": 2.8947368421052634e-05, "epoch": 0.11671087533156499, "percentage": 5.84, "elapsed_time": "0:31:01", "remaining_time": "8:20:15", "throughput": "0.00", "total_tokens": 0}
12
+ {"current_steps": 60, "total_steps": 942, "loss": 1.0347, "learning_rate": 3.157894736842105e-05, "epoch": 0.1273209549071618, "percentage": 6.37, "elapsed_time": "0:33:58", "remaining_time": "8:19:24", "throughput": "0.00", "total_tokens": 0}
13
+ {"current_steps": 65, "total_steps": 942, "loss": 1.0234, "learning_rate": 3.421052631578947e-05, "epoch": 0.13793103448275862, "percentage": 6.9, "elapsed_time": "0:36:46", "remaining_time": "8:16:06", "throughput": "0.00", "total_tokens": 0}
14
+ {"current_steps": 70, "total_steps": 942, "loss": 1.0313, "learning_rate": 3.6842105263157895e-05, "epoch": 0.14854111405835543, "percentage": 7.43, "elapsed_time": "0:39:41", "remaining_time": "8:14:31", "throughput": "0.00", "total_tokens": 0}
15
+ {"current_steps": 75, "total_steps": 942, "loss": 1.0114, "learning_rate": 3.9473684210526316e-05, "epoch": 0.15915119363395225, "percentage": 7.96, "elapsed_time": "0:42:29", "remaining_time": "8:11:12", "throughput": "0.00", "total_tokens": 0}
16
+ {"current_steps": 80, "total_steps": 942, "loss": 1.0158, "learning_rate": 4.210526315789474e-05, "epoch": 0.16976127320954906, "percentage": 8.49, "elapsed_time": "0:45:25", "remaining_time": "8:09:24", "throughput": "0.00", "total_tokens": 0}
17
+ {"current_steps": 85, "total_steps": 942, "loss": 1.0187, "learning_rate": 4.473684210526316e-05, "epoch": 0.18037135278514588, "percentage": 9.02, "elapsed_time": "0:48:14", "remaining_time": "8:06:27", "throughput": "0.00", "total_tokens": 0}
18
+ {"current_steps": 90, "total_steps": 942, "loss": 1.0084, "learning_rate": 4.736842105263158e-05, "epoch": 0.1909814323607427, "percentage": 9.55, "elapsed_time": "0:51:04", "remaining_time": "8:03:33", "throughput": "0.00", "total_tokens": 0}
19
+ {"current_steps": 95, "total_steps": 942, "loss": 1.0017, "learning_rate": 5e-05, "epoch": 0.20159151193633953, "percentage": 10.08, "elapsed_time": "0:53:58", "remaining_time": "8:01:17", "throughput": "0.00", "total_tokens": 0}
20
+ {"current_steps": 100, "total_steps": 942, "loss": 1.0282, "learning_rate": 4.999570096976961e-05, "epoch": 0.21220159151193635, "percentage": 10.62, "elapsed_time": "0:56:56", "remaining_time": "7:59:25", "throughput": "0.00", "total_tokens": 0}
21
+ {"current_steps": 100, "total_steps": 942, "eval_loss": 1.0028369426727295, "epoch": 0.21220159151193635, "percentage": 10.62, "elapsed_time": "1:05:17", "remaining_time": "9:09:47", "throughput": "0.00", "total_tokens": 0}
22
+ {"current_steps": 105, "total_steps": 942, "loss": 0.9892, "learning_rate": 4.998280535761132e-05, "epoch": 0.22281167108753316, "percentage": 11.15, "elapsed_time": "1:08:13", "remaining_time": "9:03:54", "throughput": "0.00", "total_tokens": 0}
23
+ {"current_steps": 110, "total_steps": 942, "loss": 1.0344, "learning_rate": 4.996131759861523e-05, "epoch": 0.23342175066312998, "percentage": 11.68, "elapsed_time": "1:11:03", "remaining_time": "8:57:26", "throughput": "0.00", "total_tokens": 0}
24
+ {"current_steps": 115, "total_steps": 942, "loss": 0.9892, "learning_rate": 4.99312450829034e-05, "epoch": 0.2440318302387268, "percentage": 12.21, "elapsed_time": "1:13:56", "remaining_time": "8:51:44", "throughput": "0.00", "total_tokens": 0}
25
+ {"current_steps": 120, "total_steps": 942, "loss": 1.0061, "learning_rate": 4.989259815308815e-05, "epoch": 0.2546419098143236, "percentage": 12.74, "elapsed_time": "1:16:45", "remaining_time": "8:45:46", "throughput": "0.00", "total_tokens": 0}
26
+ {"current_steps": 125, "total_steps": 942, "loss": 1.0031, "learning_rate": 4.984539010071506e-05, "epoch": 0.26525198938992045, "percentage": 13.27, "elapsed_time": "1:19:43", "remaining_time": "8:41:07", "throughput": "0.00", "total_tokens": 0}
27
+ {"current_steps": 130, "total_steps": 942, "loss": 0.9758, "learning_rate": 4.978963716169166e-05, "epoch": 0.27586206896551724, "percentage": 13.8, "elapsed_time": "1:22:39", "remaining_time": "8:36:16", "throughput": "0.00", "total_tokens": 0}
28
+ {"current_steps": 135, "total_steps": 942, "loss": 1.0091, "learning_rate": 4.972535851070358e-05, "epoch": 0.2864721485411141, "percentage": 14.33, "elapsed_time": "1:25:36", "remaining_time": "8:31:44", "throughput": "0.00", "total_tokens": 0}
29
+ {"current_steps": 140, "total_steps": 942, "loss": 0.9945, "learning_rate": 4.965257625461992e-05, "epoch": 0.29708222811671087, "percentage": 14.86, "elapsed_time": "1:28:29", "remaining_time": "8:26:57", "throughput": "0.00", "total_tokens": 0}
30
+ {"current_steps": 145, "total_steps": 942, "loss": 0.9934, "learning_rate": 4.957131542489021e-05, "epoch": 0.3076923076923077, "percentage": 15.39, "elapsed_time": "1:31:15", "remaining_time": "8:21:33", "throughput": "0.00", "total_tokens": 0}
31
+ {"current_steps": 150, "total_steps": 942, "loss": 1.0224, "learning_rate": 4.948160396893553e-05, "epoch": 0.3183023872679045, "percentage": 15.92, "elapsed_time": "1:34:03", "remaining_time": "8:16:36", "throughput": "0.00", "total_tokens": 0}
32
+ {"current_steps": 155, "total_steps": 942, "loss": 0.9976, "learning_rate": 4.9383472740536785e-05, "epoch": 0.32891246684350134, "percentage": 16.45, "elapsed_time": "1:37:00", "remaining_time": "8:12:30", "throughput": "0.00", "total_tokens": 0}
33
+ {"current_steps": 160, "total_steps": 942, "loss": 1.0121, "learning_rate": 4.927695548922335e-05, "epoch": 0.3395225464190981, "percentage": 16.99, "elapsed_time": "1:39:54", "remaining_time": "8:08:19", "throughput": "0.00", "total_tokens": 0}
34
+ {"current_steps": 165, "total_steps": 942, "loss": 1.0045, "learning_rate": 4.916208884866593e-05, "epoch": 0.35013262599469497, "percentage": 17.52, "elapsed_time": "1:42:42", "remaining_time": "8:03:41", "throughput": "0.00", "total_tokens": 0}
35
+ {"current_steps": 170, "total_steps": 942, "loss": 1.0037, "learning_rate": 4.9038912324077315e-05, "epoch": 0.36074270557029176, "percentage": 18.05, "elapsed_time": "1:45:40", "remaining_time": "7:59:54", "throughput": "0.00", "total_tokens": 0}
36
+ {"current_steps": 175, "total_steps": 942, "loss": 1.0125, "learning_rate": 4.8907468278625747e-05, "epoch": 0.3713527851458886, "percentage": 18.58, "elapsed_time": "1:48:32", "remaining_time": "7:55:42", "throughput": "0.00", "total_tokens": 0}
37
+ {"current_steps": 180, "total_steps": 942, "loss": 1.0041, "learning_rate": 4.876780191886523e-05, "epoch": 0.3819628647214854, "percentage": 19.11, "elapsed_time": "1:51:15", "remaining_time": "7:50:58", "throughput": "0.00", "total_tokens": 0}
38
+ {"current_steps": 185, "total_steps": 942, "loss": 0.9925, "learning_rate": 4.861996127918798e-05, "epoch": 0.3925729442970822, "percentage": 19.64, "elapsed_time": "1:54:06", "remaining_time": "7:46:54", "throughput": "0.00", "total_tokens": 0}
39
+ {"current_steps": 190, "total_steps": 942, "loss": 0.9977, "learning_rate": 4.846399720530434e-05, "epoch": 0.40318302387267907, "percentage": 20.17, "elapsed_time": "1:56:55", "remaining_time": "7:42:46", "throughput": "0.00", "total_tokens": 0}
40
+ {"current_steps": 195, "total_steps": 942, "loss": 0.99, "learning_rate": 4.8299963336755784e-05, "epoch": 0.41379310344827586, "percentage": 20.7, "elapsed_time": "1:59:46", "remaining_time": "7:38:48", "throughput": "0.00", "total_tokens": 0}
41
+ {"current_steps": 200, "total_steps": 942, "loss": 1.0122, "learning_rate": 4.81279160884671e-05, "epoch": 0.4244031830238727, "percentage": 21.23, "elapsed_time": "2:02:30", "remaining_time": "7:34:30", "throughput": "0.00", "total_tokens": 0}
42
+ {"current_steps": 200, "total_steps": 942, "eval_loss": 0.9916980266571045, "epoch": 0.4244031830238727, "percentage": 21.23, "elapsed_time": "2:10:52", "remaining_time": "8:05:31", "throughput": "0.00", "total_tokens": 0}
43
+ {"current_steps": 205, "total_steps": 942, "loss": 0.9895, "learning_rate": 4.794791463134399e-05, "epoch": 0.4350132625994695, "percentage": 21.76, "elapsed_time": "2:13:49", "remaining_time": "8:01:06", "throughput": "0.00", "total_tokens": 0}
44
+ {"current_steps": 210, "total_steps": 942, "loss": 1.0012, "learning_rate": 4.7760020871922914e-05, "epoch": 0.44562334217506633, "percentage": 22.29, "elapsed_time": "2:16:37", "remaining_time": "7:56:14", "throughput": "0.00", "total_tokens": 0}
45
+ {"current_steps": 215, "total_steps": 942, "loss": 0.9883, "learning_rate": 4.7564299431080016e-05, "epoch": 0.4562334217506631, "percentage": 22.82, "elapsed_time": "2:19:33", "remaining_time": "7:51:55", "throughput": "0.00", "total_tokens": 0}
46
+ {"current_steps": 220, "total_steps": 942, "loss": 1.0064, "learning_rate": 4.736081762180658e-05, "epoch": 0.46684350132625996, "percentage": 23.35, "elapsed_time": "2:22:20", "remaining_time": "7:47:08", "throughput": "0.00", "total_tokens": 0}
47
+ {"current_steps": 225, "total_steps": 942, "loss": 1.0064, "learning_rate": 4.714964542605855e-05, "epoch": 0.47745358090185674, "percentage": 23.89, "elapsed_time": "2:25:11", "remaining_time": "7:42:41", "throughput": "0.00", "total_tokens": 0}
48
+ {"current_steps": 230, "total_steps": 942, "loss": 0.9942, "learning_rate": 4.69308554706882e-05, "epoch": 0.4880636604774536, "percentage": 24.42, "elapsed_time": "2:28:03", "remaining_time": "7:38:19", "throughput": "0.00", "total_tokens": 0}
49
+ {"current_steps": 235, "total_steps": 942, "loss": 0.9857, "learning_rate": 4.67045230024661e-05, "epoch": 0.4986737400530504, "percentage": 24.95, "elapsed_time": "2:30:57", "remaining_time": "7:34:09", "throughput": "0.00", "total_tokens": 0}
50
+ {"current_steps": 240, "total_steps": 942, "loss": 0.9769, "learning_rate": 4.64707258622021e-05, "epoch": 0.5092838196286472, "percentage": 25.48, "elapsed_time": "2:33:49", "remaining_time": "7:29:56", "throughput": "0.00", "total_tokens": 0}
51
+ {"current_steps": 245, "total_steps": 942, "loss": 0.9993, "learning_rate": 4.622954445797409e-05, "epoch": 0.519893899204244, "percentage": 26.01, "elapsed_time": "2:36:38", "remaining_time": "7:25:38", "throughput": "0.00", "total_tokens": 0}
52
+ {"current_steps": 250, "total_steps": 942, "loss": 0.9958, "learning_rate": 4.5981061737473904e-05, "epoch": 0.5305039787798409, "percentage": 26.54, "elapsed_time": "2:39:30", "remaining_time": "7:21:31", "throughput": "0.00", "total_tokens": 0}
53
+ {"current_steps": 255, "total_steps": 942, "loss": 0.9801, "learning_rate": 4.572536315947971e-05, "epoch": 0.5411140583554377, "percentage": 27.07, "elapsed_time": "2:42:16", "remaining_time": "7:17:12", "throughput": "0.00", "total_tokens": 0}
54
+ {"current_steps": 260, "total_steps": 942, "loss": 0.9937, "learning_rate": 4.546253666446484e-05, "epoch": 0.5517241379310345, "percentage": 27.6, "elapsed_time": "2:45:12", "remaining_time": "7:13:21", "throughput": "0.00", "total_tokens": 0}
55
+ {"current_steps": 265, "total_steps": 942, "loss": 0.9951, "learning_rate": 4.519267264435309e-05, "epoch": 0.5623342175066313, "percentage": 28.13, "elapsed_time": "2:47:58", "remaining_time": "7:09:08", "throughput": "0.00", "total_tokens": 0}
56
+ {"current_steps": 270, "total_steps": 942, "loss": 1.0039, "learning_rate": 4.49158639114309e-05, "epoch": 0.5729442970822282, "percentage": 28.66, "elapsed_time": "2:50:53", "remaining_time": "7:05:20", "throughput": "0.00", "total_tokens": 0}
57
+ {"current_steps": 275, "total_steps": 942, "loss": 0.9739, "learning_rate": 4.463220566642715e-05, "epoch": 0.583554376657825, "percentage": 29.19, "elapsed_time": "2:53:41", "remaining_time": "7:01:17", "throughput": "0.00", "total_tokens": 0}
58
+ {"current_steps": 280, "total_steps": 942, "loss": 0.9864, "learning_rate": 4.434179546577146e-05, "epoch": 0.5941644562334217, "percentage": 29.72, "elapsed_time": "2:56:28", "remaining_time": "6:57:15", "throughput": "0.00", "total_tokens": 0}
59
+ {"current_steps": 285, "total_steps": 942, "loss": 0.9894, "learning_rate": 4.4044733188042384e-05, "epoch": 0.6047745358090185, "percentage": 30.25, "elapsed_time": "2:59:23", "remaining_time": "6:53:31", "throughput": "0.00", "total_tokens": 0}
60
+ {"current_steps": 290, "total_steps": 942, "loss": 0.9876, "learning_rate": 4.374112099961689e-05, "epoch": 0.6153846153846154, "percentage": 30.79, "elapsed_time": "3:02:14", "remaining_time": "6:49:44", "throughput": "0.00", "total_tokens": 0}
61
+ {"current_steps": 295, "total_steps": 942, "loss": 0.9979, "learning_rate": 4.34310633195331e-05, "epoch": 0.6259946949602122, "percentage": 31.32, "elapsed_time": "3:05:08", "remaining_time": "6:46:03", "throughput": "0.00", "total_tokens": 0}
62
+ {"current_steps": 300, "total_steps": 942, "loss": 0.9884, "learning_rate": 4.3114666783578195e-05, "epoch": 0.636604774535809, "percentage": 31.85, "elapsed_time": "3:08:01", "remaining_time": "6:42:22", "throughput": "0.00", "total_tokens": 0}
63
+ {"current_steps": 300, "total_steps": 942, "eval_loss": 0.9869238138198853, "epoch": 0.636604774535809, "percentage": 31.85, "elapsed_time": "3:16:22", "remaining_time": "7:00:15", "throughput": "0.00", "total_tokens": 0}
64
+ {"current_steps": 305, "total_steps": 942, "loss": 1.0002, "learning_rate": 4.2792040207614005e-05, "epoch": 0.6472148541114059, "percentage": 32.38, "elapsed_time": "3:19:18", "remaining_time": "6:56:16", "throughput": "0.00", "total_tokens": 0}
65
+ {"current_steps": 310, "total_steps": 942, "loss": 0.9769, "learning_rate": 4.2463294550152786e-05, "epoch": 0.6578249336870027, "percentage": 32.91, "elapsed_time": "3:22:18", "remaining_time": "6:52:26", "throughput": "0.00", "total_tokens": 0}
66
+ {"current_steps": 315, "total_steps": 942, "loss": 0.9958, "learning_rate": 4.212854287419611e-05, "epoch": 0.6684350132625995, "percentage": 33.44, "elapsed_time": "3:25:05", "remaining_time": "6:48:13", "throughput": "0.00", "total_tokens": 0}
67
+ {"current_steps": 320, "total_steps": 942, "loss": 0.9857, "learning_rate": 4.1787900308349924e-05, "epoch": 0.6790450928381963, "percentage": 33.97, "elapsed_time": "3:28:01", "remaining_time": "6:44:20", "throughput": "0.00", "total_tokens": 0}
68
+ {"current_steps": 325, "total_steps": 942, "loss": 0.9694, "learning_rate": 4.1441484007229314e-05, "epoch": 0.6896551724137931, "percentage": 34.5, "elapsed_time": "3:30:52", "remaining_time": "6:40:20", "throughput": "0.00", "total_tokens": 0}
69
+ {"current_steps": 330, "total_steps": 942, "loss": 0.9931, "learning_rate": 4.108941311116634e-05, "epoch": 0.7002652519893899, "percentage": 35.03, "elapsed_time": "3:33:40", "remaining_time": "6:36:16", "throughput": "0.00", "total_tokens": 0}
70
+ {"current_steps": 335, "total_steps": 942, "loss": 0.9584, "learning_rate": 4.073180870523503e-05, "epoch": 0.7108753315649867, "percentage": 35.56, "elapsed_time": "3:36:29", "remaining_time": "6:32:16", "throughput": "0.00", "total_tokens": 0}
71
+ {"current_steps": 340, "total_steps": 942, "loss": 0.9904, "learning_rate": 4.0368793777607524e-05, "epoch": 0.7214854111405835, "percentage": 36.09, "elapsed_time": "3:39:16", "remaining_time": "6:28:14", "throughput": "0.00", "total_tokens": 0}
72
+ {"current_steps": 345, "total_steps": 942, "loss": 0.9867, "learning_rate": 4.000049317725565e-05, "epoch": 0.7320954907161804, "percentage": 36.62, "elapsed_time": "3:42:05", "remaining_time": "6:24:19", "throughput": "0.00", "total_tokens": 0}
73
+ {"current_steps": 350, "total_steps": 942, "loss": 0.9905, "learning_rate": 3.9627033571012586e-05, "epoch": 0.7427055702917772, "percentage": 37.15, "elapsed_time": "3:44:59", "remaining_time": "6:20:33", "throughput": "0.00", "total_tokens": 0}
74
+ {"current_steps": 355, "total_steps": 942, "loss": 0.9901, "learning_rate": 3.924854340000931e-05, "epoch": 0.753315649867374, "percentage": 37.69, "elapsed_time": "3:47:52", "remaining_time": "6:16:47", "throughput": "0.00", "total_tokens": 0}
75
+ {"current_steps": 360, "total_steps": 942, "loss": 0.9806, "learning_rate": 3.886515283550079e-05, "epoch": 0.7639257294429708, "percentage": 38.22, "elapsed_time": "3:50:54", "remaining_time": "6:13:18", "throughput": "0.00", "total_tokens": 0}
76
+ {"current_steps": 365, "total_steps": 942, "loss": 0.9849, "learning_rate": 3.8476993734097155e-05, "epoch": 0.7745358090185677, "percentage": 38.75, "elapsed_time": "3:53:45", "remaining_time": "6:09:31", "throughput": "0.00", "total_tokens": 0}
77
+ {"current_steps": 370, "total_steps": 942, "loss": 0.9618, "learning_rate": 3.8084199592415305e-05, "epoch": 0.7851458885941645, "percentage": 39.28, "elapsed_time": "3:56:34", "remaining_time": "6:05:44", "throughput": "0.00", "total_tokens": 0}
78
+ {"current_steps": 375, "total_steps": 942, "loss": 0.9827, "learning_rate": 3.768690550116639e-05, "epoch": 0.7957559681697612, "percentage": 39.81, "elapsed_time": "3:59:30", "remaining_time": "6:02:08", "throughput": "0.00", "total_tokens": 0}
79
+ {"current_steps": 380, "total_steps": 942, "loss": 0.9896, "learning_rate": 3.728524809869511e-05, "epoch": 0.8063660477453581, "percentage": 40.34, "elapsed_time": "4:02:27", "remaining_time": "5:58:35", "throughput": "0.00", "total_tokens": 0}
80
+ {"current_steps": 385, "total_steps": 942, "loss": 1.0072, "learning_rate": 3.6879365523986706e-05, "epoch": 0.8169761273209549, "percentage": 40.87, "elapsed_time": "4:05:18", "remaining_time": "5:54:53", "throughput": "0.00", "total_tokens": 0}
81
+ {"current_steps": 390, "total_steps": 942, "loss": 0.9836, "learning_rate": 3.646939736915786e-05, "epoch": 0.8275862068965517, "percentage": 41.4, "elapsed_time": "4:08:09", "remaining_time": "5:51:14", "throughput": "0.00", "total_tokens": 0}
82
+ {"current_steps": 395, "total_steps": 942, "loss": 0.9666, "learning_rate": 3.605548463144786e-05, "epoch": 0.8381962864721485, "percentage": 41.93, "elapsed_time": "4:11:02", "remaining_time": "5:47:39", "throughput": "0.00", "total_tokens": 0}
83
+ {"current_steps": 400, "total_steps": 942, "loss": 0.9771, "learning_rate": 3.563776966472649e-05, "epoch": 0.8488063660477454, "percentage": 42.46, "elapsed_time": "4:13:51", "remaining_time": "5:43:59", "throughput": "0.00", "total_tokens": 0}
84
+ {"current_steps": 400, "total_steps": 942, "eval_loss": 0.9840684533119202, "epoch": 0.8488063660477454, "percentage": 42.46, "elapsed_time": "4:22:13", "remaining_time": "5:55:18", "throughput": "0.00", "total_tokens": 0}
85
+ {"current_steps": 405, "total_steps": 942, "loss": 0.9998, "learning_rate": 3.52163961305353e-05, "epoch": 0.8594164456233422, "percentage": 42.99, "elapsed_time": "4:25:03", "remaining_time": "5:51:27", "throughput": "0.00", "total_tokens": 0}
86
+ {"current_steps": 410, "total_steps": 942, "loss": 0.9685, "learning_rate": 3.479150894867926e-05, "epoch": 0.870026525198939, "percentage": 43.52, "elapsed_time": "4:28:03", "remaining_time": "5:47:49", "throughput": "0.00", "total_tokens": 0}
87
+ {"current_steps": 415, "total_steps": 942, "loss": 0.9895, "learning_rate": 3.436325424738549e-05, "epoch": 0.8806366047745358, "percentage": 44.06, "elapsed_time": "4:30:50", "remaining_time": "5:43:55", "throughput": "0.00", "total_tokens": 0}
88
+ {"current_steps": 420, "total_steps": 942, "loss": 0.9861, "learning_rate": 3.3931779313046574e-05, "epoch": 0.8912466843501327, "percentage": 44.59, "elapsed_time": "4:33:41", "remaining_time": "5:40:09", "throughput": "0.00", "total_tokens": 0}
89
+ {"current_steps": 425, "total_steps": 942, "loss": 0.9879, "learning_rate": 3.349723253956542e-05, "epoch": 0.9018567639257294, "percentage": 45.12, "elapsed_time": "4:36:36", "remaining_time": "5:36:29", "throughput": "0.00", "total_tokens": 0}
90
+ {"current_steps": 430, "total_steps": 942, "loss": 0.9686, "learning_rate": 3.3059763377319294e-05, "epoch": 0.9124668435013262, "percentage": 45.65, "elapsed_time": "4:39:26", "remaining_time": "5:32:43", "throughput": "0.00", "total_tokens": 0}
91
+ {"current_steps": 435, "total_steps": 942, "loss": 0.9697, "learning_rate": 3.261952228176044e-05, "epoch": 0.9230769230769231, "percentage": 46.18, "elapsed_time": "4:42:16", "remaining_time": "5:28:59", "throughput": "0.00", "total_tokens": 0}
92
+ {"current_steps": 440, "total_steps": 942, "loss": 0.991, "learning_rate": 3.217666066167117e-05, "epoch": 0.9336870026525199, "percentage": 46.71, "elapsed_time": "4:45:16", "remaining_time": "5:25:28", "throughput": "0.00", "total_tokens": 0}
93
+ {"current_steps": 445, "total_steps": 942, "loss": 0.9796, "learning_rate": 3.1731330827090865e-05, "epoch": 0.9442970822281167, "percentage": 47.24, "elapsed_time": "4:48:10", "remaining_time": "5:21:50", "throughput": "0.00", "total_tokens": 0}
94
+ {"current_steps": 450, "total_steps": 942, "loss": 0.9914, "learning_rate": 3.128368593693325e-05, "epoch": 0.9549071618037135, "percentage": 47.77, "elapsed_time": "4:51:00", "remaining_time": "5:18:09", "throughput": "0.00", "total_tokens": 0}
95
+ {"current_steps": 455, "total_steps": 942, "loss": 0.9934, "learning_rate": 3.083387994631154e-05, "epoch": 0.9655172413793104, "percentage": 48.3, "elapsed_time": "4:53:51", "remaining_time": "5:14:31", "throughput": "0.00", "total_tokens": 0}
96
+ {"current_steps": 460, "total_steps": 942, "loss": 0.9927, "learning_rate": 3.0382067553589867e-05, "epoch": 0.9761273209549072, "percentage": 48.83, "elapsed_time": "4:56:38", "remaining_time": "5:10:49", "throughput": "0.00", "total_tokens": 0}
97
+ {"current_steps": 465, "total_steps": 942, "loss": 0.9738, "learning_rate": 2.992840414717899e-05, "epoch": 0.986737400530504, "percentage": 49.36, "elapsed_time": "4:59:30", "remaining_time": "5:07:14", "throughput": "0.00", "total_tokens": 0}
98
+ {"current_steps": 470, "total_steps": 942, "loss": 0.9789, "learning_rate": 2.9473045752094818e-05, "epoch": 0.9973474801061007, "percentage": 49.89, "elapsed_time": "5:02:26", "remaining_time": "5:03:43", "throughput": "0.00", "total_tokens": 0}
99
+ {"current_steps": 475, "total_steps": 942, "loss": 0.9656, "learning_rate": 2.9016148976297832e-05, "epoch": 1.0079575596816976, "percentage": 50.42, "elapsed_time": "5:05:15", "remaining_time": "5:00:07", "throughput": "0.00", "total_tokens": 0}
100
+ {"current_steps": 480, "total_steps": 942, "loss": 0.9702, "learning_rate": 2.8557870956832132e-05, "epoch": 1.0185676392572944, "percentage": 50.96, "elapsed_time": "5:08:08", "remaining_time": "4:56:35", "throughput": "0.00", "total_tokens": 0}
101
+ {"current_steps": 485, "total_steps": 942, "loss": 0.9916, "learning_rate": 2.809836930578249e-05, "epoch": 1.0291777188328912, "percentage": 51.49, "elapsed_time": "5:11:00", "remaining_time": "4:53:03", "throughput": "0.00", "total_tokens": 0}
102
+ {"current_steps": 490, "total_steps": 942, "loss": 0.9879, "learning_rate": 2.7637802056068018e-05, "epoch": 1.039787798408488, "percentage": 52.02, "elapsed_time": "5:13:55", "remaining_time": "4:49:34", "throughput": "0.00", "total_tokens": 0}
103
+ {"current_steps": 495, "total_steps": 942, "loss": 0.9665, "learning_rate": 2.7176327607091075e-05, "epoch": 1.0503978779840848, "percentage": 52.55, "elapsed_time": "5:16:50", "remaining_time": "4:46:07", "throughput": "0.00", "total_tokens": 0}
104
+ {"current_steps": 500, "total_steps": 942, "loss": 0.9974, "learning_rate": 2.671410467026021e-05, "epoch": 1.0610079575596818, "percentage": 53.08, "elapsed_time": "5:19:45", "remaining_time": "4:42:40", "throughput": "0.00", "total_tokens": 0}
105
+ {"current_steps": 500, "total_steps": 942, "eval_loss": 0.9823258519172668, "epoch": 1.0610079575596818, "percentage": 53.08, "elapsed_time": "5:28:07", "remaining_time": "4:50:03", "throughput": "0.00", "total_tokens": 0}
106
+ {"current_steps": 505, "total_steps": 942, "loss": 0.981, "learning_rate": 2.625129221440569e-05, "epoch": 1.0716180371352786, "percentage": 53.61, "elapsed_time": "5:30:59", "remaining_time": "4:46:24", "throughput": "0.00", "total_tokens": 0}
107
+ {"current_steps": 510, "total_steps": 942, "loss": 1.0164, "learning_rate": 2.578804941110664e-05, "epoch": 1.0822281167108754, "percentage": 54.14, "elapsed_time": "5:33:46", "remaining_time": "4:42:43", "throughput": "0.00", "total_tokens": 0}
108
+ {"current_steps": 515, "total_steps": 942, "loss": 0.9685, "learning_rate": 2.5324535579948274e-05, "epoch": 1.0928381962864722, "percentage": 54.67, "elapsed_time": "5:36:36", "remaining_time": "4:39:05", "throughput": "0.00", "total_tokens": 0}
109
+ {"current_steps": 520, "total_steps": 942, "loss": 0.9723, "learning_rate": 2.4860910133728388e-05, "epoch": 1.103448275862069, "percentage": 55.2, "elapsed_time": "5:39:28", "remaining_time": "4:35:29", "throughput": "0.00", "total_tokens": 0}
110
+ {"current_steps": 525, "total_steps": 942, "loss": 0.9954, "learning_rate": 2.4397332523631684e-05, "epoch": 1.1140583554376657, "percentage": 55.73, "elapsed_time": "5:42:20", "remaining_time": "4:31:55", "throughput": "0.00", "total_tokens": 0}
111
+ {"current_steps": 530, "total_steps": 942, "loss": 0.9805, "learning_rate": 2.393396218439097e-05, "epoch": 1.1246684350132625, "percentage": 56.26, "elapsed_time": "5:45:06", "remaining_time": "4:28:16", "throughput": "0.00", "total_tokens": 0}
112
+ {"current_steps": 535, "total_steps": 942, "loss": 0.9785, "learning_rate": 2.3470958479453938e-05, "epoch": 1.1352785145888595, "percentage": 56.79, "elapsed_time": "5:47:56", "remaining_time": "4:24:41", "throughput": "0.00", "total_tokens": 0}
113
+ {"current_steps": 540, "total_steps": 942, "loss": 0.9934, "learning_rate": 2.3008480646174534e-05, "epoch": 1.1458885941644563, "percentage": 57.32, "elapsed_time": "5:50:41", "remaining_time": "4:21:04", "throughput": "0.00", "total_tokens": 0}
114
+ {"current_steps": 545, "total_steps": 942, "loss": 0.9989, "learning_rate": 2.2546687741047645e-05, "epoch": 1.156498673740053, "percentage": 57.86, "elapsed_time": "5:53:32", "remaining_time": "4:17:31", "throughput": "0.00", "total_tokens": 0}
115
+ {"current_steps": 550, "total_steps": 942, "loss": 0.9759, "learning_rate": 2.2085738585006024e-05, "epoch": 1.16710875331565, "percentage": 58.39, "elapsed_time": "5:56:25", "remaining_time": "4:14:02", "throughput": "0.00", "total_tokens": 0}
116
+ {"current_steps": 555, "total_steps": 942, "loss": 0.9783, "learning_rate": 2.1625791708798188e-05, "epoch": 1.1777188328912467, "percentage": 58.92, "elapsed_time": "5:59:13", "remaining_time": "4:10:29", "throughput": "0.00", "total_tokens": 0}
117
+ {"current_steps": 560, "total_steps": 942, "loss": 0.9713, "learning_rate": 2.1167005298466156e-05, "epoch": 1.1883289124668435, "percentage": 59.45, "elapsed_time": "6:02:09", "remaining_time": "4:07:02", "throughput": "0.00", "total_tokens": 0}
118
+ {"current_steps": 565, "total_steps": 942, "loss": 0.9654, "learning_rate": 2.0709537140941705e-05, "epoch": 1.1989389920424403, "percentage": 59.98, "elapsed_time": "6:04:56", "remaining_time": "4:03:30", "throughput": "0.00", "total_tokens": 0}
119
+ {"current_steps": 570, "total_steps": 942, "loss": 0.9661, "learning_rate": 2.0253544569779933e-05, "epoch": 1.209549071618037, "percentage": 60.51, "elapsed_time": "6:07:46", "remaining_time": "4:00:01", "throughput": "0.00", "total_tokens": 0}
120
+ {"current_steps": 575, "total_steps": 942, "loss": 0.9759, "learning_rate": 1.9799184411048695e-05, "epoch": 1.2201591511936338, "percentage": 61.04, "elapsed_time": "6:10:35", "remaining_time": "3:56:32", "throughput": "0.00", "total_tokens": 0}
121
+ {"current_steps": 580, "total_steps": 942, "loss": 0.9804, "learning_rate": 1.9346612929392636e-05, "epoch": 1.2307692307692308, "percentage": 61.57, "elapsed_time": "6:13:29", "remaining_time": "3:53:06", "throughput": "0.00", "total_tokens": 0}
122
+ {"current_steps": 585, "total_steps": 942, "loss": 0.9807, "learning_rate": 1.889598577429022e-05, "epoch": 1.2413793103448276, "percentage": 62.1, "elapsed_time": "6:16:24", "remaining_time": "3:49:42", "throughput": "0.00", "total_tokens": 0}
123
+ {"current_steps": 590, "total_steps": 942, "loss": 0.9945, "learning_rate": 1.8447457926522454e-05, "epoch": 1.2519893899204244, "percentage": 62.63, "elapsed_time": "6:19:16", "remaining_time": "3:46:16", "throughput": "0.00", "total_tokens": 0}
124
+ {"current_steps": 595, "total_steps": 942, "loss": 0.9769, "learning_rate": 1.800118364487146e-05, "epoch": 1.2625994694960212, "percentage": 63.16, "elapsed_time": "6:22:09", "remaining_time": "3:42:52", "throughput": "0.00", "total_tokens": 0}
125
+ {"current_steps": 600, "total_steps": 942, "loss": 0.9934, "learning_rate": 1.7557316413067488e-05, "epoch": 1.273209549071618, "percentage": 63.69, "elapsed_time": "6:25:05", "remaining_time": "3:39:30", "throughput": "0.00", "total_tokens": 0}
126
+ {"current_steps": 600, "total_steps": 942, "eval_loss": 0.9812601804733276, "epoch": 1.273209549071618, "percentage": 63.69, "elapsed_time": "6:33:27", "remaining_time": "3:44:16", "throughput": "0.00", "total_tokens": 0}
127
+ {"current_steps": 605, "total_steps": 942, "loss": 0.9622, "learning_rate": 1.7116008887002344e-05, "epoch": 1.2838196286472148, "percentage": 64.23, "elapsed_time": "6:36:30", "remaining_time": "3:40:51", "throughput": "0.00", "total_tokens": 0}
128
+ {"current_steps": 610, "total_steps": 942, "loss": 0.9831, "learning_rate": 1.667741284222768e-05, "epoch": 1.2944297082228116, "percentage": 64.76, "elapsed_time": "6:39:27", "remaining_time": "3:37:24", "throughput": "0.00", "total_tokens": 0}
129
+ {"current_steps": 615, "total_steps": 942, "loss": 0.9827, "learning_rate": 1.6241679121755914e-05, "epoch": 1.3050397877984086, "percentage": 65.29, "elapsed_time": "6:42:14", "remaining_time": "3:33:52", "throughput": "0.00", "total_tokens": 0}
130
+ {"current_steps": 620, "total_steps": 942, "loss": 0.9754, "learning_rate": 1.5808957584181998e-05, "epoch": 1.3156498673740054, "percentage": 65.82, "elapsed_time": "6:45:07", "remaining_time": "3:30:24", "throughput": "0.00", "total_tokens": 0}
131
+ {"current_steps": 625, "total_steps": 942, "loss": 0.9897, "learning_rate": 1.537939705214364e-05, "epoch": 1.3262599469496021, "percentage": 66.35, "elapsed_time": "6:47:57", "remaining_time": "3:26:54", "throughput": "0.00", "total_tokens": 0}
132
+ {"current_steps": 630, "total_steps": 942, "loss": 0.9797, "learning_rate": 1.4953145261137868e-05, "epoch": 1.336870026525199, "percentage": 66.88, "elapsed_time": "6:50:48", "remaining_time": "3:23:26", "throughput": "0.00", "total_tokens": 0}
133
+ {"current_steps": 635, "total_steps": 942, "loss": 0.9844, "learning_rate": 1.4530348808711508e-05, "epoch": 1.3474801061007957, "percentage": 67.41, "elapsed_time": "6:53:42", "remaining_time": "3:20:00", "throughput": "0.00", "total_tokens": 0}
134
+ {"current_steps": 640, "total_steps": 942, "loss": 0.9955, "learning_rate": 1.4111153104042993e-05, "epoch": 1.3580901856763925, "percentage": 67.94, "elapsed_time": "6:56:32", "remaining_time": "3:16:33", "throughput": "0.00", "total_tokens": 0}
135
+ {"current_steps": 645, "total_steps": 942, "loss": 0.998, "learning_rate": 1.3695702317932862e-05, "epoch": 1.3687002652519893, "percentage": 68.47, "elapsed_time": "6:59:23", "remaining_time": "3:13:06", "throughput": "0.00", "total_tokens": 0}
136
+ {"current_steps": 650, "total_steps": 942, "loss": 0.9832, "learning_rate": 1.3284139333220207e-05, "epoch": 1.3793103448275863, "percentage": 69.0, "elapsed_time": "7:02:15", "remaining_time": "3:09:41", "throughput": "0.00", "total_tokens": 0}
137
+ {"current_steps": 655, "total_steps": 942, "loss": 0.9818, "learning_rate": 1.2876605695642086e-05, "epoch": 1.389920424403183, "percentage": 69.53, "elapsed_time": "7:05:08", "remaining_time": "3:06:16", "throughput": "0.00", "total_tokens": 0}
138
+ {"current_steps": 660, "total_steps": 942, "loss": 0.976, "learning_rate": 1.247324156515271e-05, "epoch": 1.4005305039787799, "percentage": 70.06, "elapsed_time": "7:08:00", "remaining_time": "3:02:52", "throughput": "0.00", "total_tokens": 0}
139
+ {"current_steps": 665, "total_steps": 942, "loss": 0.9489, "learning_rate": 1.2074185667719353e-05, "epoch": 1.4111405835543767, "percentage": 70.59, "elapsed_time": "7:11:01", "remaining_time": "2:59:32", "throughput": "0.00", "total_tokens": 0}
140
+ {"current_steps": 670, "total_steps": 942, "loss": 0.9765, "learning_rate": 1.1679575247611341e-05, "epoch": 1.4217506631299734, "percentage": 71.13, "elapsed_time": "7:13:52", "remaining_time": "2:56:08", "throughput": "0.00", "total_tokens": 0}
141
+ {"current_steps": 675, "total_steps": 942, "loss": 0.9934, "learning_rate": 1.1289546020198719e-05, "epoch": 1.4323607427055702, "percentage": 71.66, "elapsed_time": "7:16:43", "remaining_time": "2:52:44", "throughput": "0.00", "total_tokens": 0}
142
+ {"current_steps": 680, "total_steps": 942, "loss": 0.9728, "learning_rate": 1.0904232125276609e-05, "epoch": 1.442970822281167, "percentage": 72.19, "elapsed_time": "7:19:33", "remaining_time": "2:49:21", "throughput": "0.00", "total_tokens": 0}
143
+ {"current_steps": 685, "total_steps": 942, "loss": 0.9811, "learning_rate": 1.052376608093162e-05, "epoch": 1.453580901856764, "percentage": 72.72, "elapsed_time": "7:22:24", "remaining_time": "2:45:58", "throughput": "0.00", "total_tokens": 0}
144
+ {"current_steps": 690, "total_steps": 942, "loss": 0.9653, "learning_rate": 1.0148278737965845e-05, "epoch": 1.4641909814323608, "percentage": 73.25, "elapsed_time": "7:25:17", "remaining_time": "2:42:37", "throughput": "0.00", "total_tokens": 0}
145
+ {"current_steps": 695, "total_steps": 942, "loss": 0.9735, "learning_rate": 9.777899234894387e-06, "epoch": 1.4748010610079576, "percentage": 73.78, "elapsed_time": "7:28:10", "remaining_time": "2:39:16", "throughput": "0.00", "total_tokens": 0}
146
+ {"current_steps": 700, "total_steps": 942, "loss": 0.9738, "learning_rate": 9.412754953531663e-06, "epoch": 1.4854111405835544, "percentage": 74.31, "elapsed_time": "7:31:03", "remaining_time": "2:35:56", "throughput": "0.00", "total_tokens": 0}
147
+ {"current_steps": 700, "total_steps": 942, "eval_loss": 0.9805117249488831, "epoch": 1.4854111405835544, "percentage": 74.31, "elapsed_time": "7:39:24", "remaining_time": "2:38:49", "throughput": "0.00", "total_tokens": 0}
148
+ {"current_steps": 705, "total_steps": 942, "loss": 0.976, "learning_rate": 9.052971475182004e-06, "epoch": 1.4960212201591512, "percentage": 74.84, "elapsed_time": "7:42:19", "remaining_time": "2:35:25", "throughput": "0.00", "total_tokens": 0}
149
+ {"current_steps": 710, "total_steps": 942, "loss": 0.9678, "learning_rate": 8.698672537449385e-06, "epoch": 1.506631299734748, "percentage": 75.37, "elapsed_time": "7:45:00", "remaining_time": "2:31:56", "throughput": "0.00", "total_tokens": 0}
150
+ {"current_steps": 715, "total_steps": 942, "loss": 0.9707, "learning_rate": 8.349979991681333e-06, "epoch": 1.5172413793103448, "percentage": 75.9, "elapsed_time": "7:47:54", "remaining_time": "2:28:33", "throughput": "0.00", "total_tokens": 0}
151
+ {"current_steps": 720, "total_steps": 942, "loss": 1.0067, "learning_rate": 8.00701376106148e-06, "epoch": 1.5278514588859418, "percentage": 76.43, "elapsed_time": "7:50:51", "remaining_time": "2:25:10", "throughput": "0.00", "total_tokens": 0}
152
+ {"current_steps": 725, "total_steps": 942, "loss": 0.9753, "learning_rate": 7.669891799365283e-06, "epoch": 1.5384615384615383, "percentage": 76.96, "elapsed_time": "7:53:48", "remaining_time": "2:21:48", "throughput": "0.00", "total_tokens": 0}
153
+ {"current_steps": 730, "total_steps": 942, "loss": 0.9703, "learning_rate": 7.338730050393114e-06, "epoch": 1.5490716180371353, "percentage": 77.49, "elapsed_time": "7:56:41", "remaining_time": "2:18:26", "throughput": "0.00", "total_tokens": 0}
154
+ {"current_steps": 735, "total_steps": 942, "loss": 0.9804, "learning_rate": 7.01364240809459e-06, "epoch": 1.5596816976127321, "percentage": 78.03, "elapsed_time": "7:59:36", "remaining_time": "2:15:04", "throughput": "0.00", "total_tokens": 0}
155
+ {"current_steps": 740, "total_steps": 942, "loss": 0.9809, "learning_rate": 6.694740677397845e-06, "epoch": 1.570291777188329, "percentage": 78.56, "elapsed_time": "8:02:30", "remaining_time": "2:11:42", "throughput": "0.00", "total_tokens": 0}
156
+ {"current_steps": 745, "total_steps": 942, "loss": 0.9675, "learning_rate": 6.382134535757339e-06, "epoch": 1.5809018567639257, "percentage": 79.09, "elapsed_time": "8:05:21", "remaining_time": "2:08:20", "throughput": "0.00", "total_tokens": 0}
157
+ {"current_steps": 750, "total_steps": 942, "loss": 0.9792, "learning_rate": 6.075931495433315e-06, "epoch": 1.5915119363395225, "percentage": 79.62, "elapsed_time": "8:08:07", "remaining_time": "2:04:57", "throughput": "0.00", "total_tokens": 0}
158
+ {"current_steps": 755, "total_steps": 942, "loss": 0.9759, "learning_rate": 5.776236866515947e-06, "epoch": 1.6021220159151195, "percentage": 80.15, "elapsed_time": "8:11:00", "remaining_time": "2:01:36", "throughput": "0.00", "total_tokens": 0}
159
+ {"current_steps": 760, "total_steps": 942, "loss": 0.9756, "learning_rate": 5.483153720706799e-06, "epoch": 1.612732095490716, "percentage": 80.68, "elapsed_time": "8:13:54", "remaining_time": "1:58:16", "throughput": "0.00", "total_tokens": 0}
160
+ {"current_steps": 765, "total_steps": 942, "loss": 0.9719, "learning_rate": 5.19678285587018e-06, "epoch": 1.623342175066313, "percentage": 81.21, "elapsed_time": "8:16:44", "remaining_time": "1:54:55", "throughput": "0.00", "total_tokens": 0}
161
+ {"current_steps": 770, "total_steps": 942, "loss": 1.0034, "learning_rate": 4.917222761366477e-06, "epoch": 1.6339522546419099, "percentage": 81.74, "elapsed_time": "8:19:30", "remaining_time": "1:51:34", "throughput": "0.00", "total_tokens": 0}
162
+ {"current_steps": 775, "total_steps": 942, "loss": 0.9789, "learning_rate": 4.644569584179509e-06, "epoch": 1.6445623342175066, "percentage": 82.27, "elapsed_time": "8:22:22", "remaining_time": "1:48:15", "throughput": "0.00", "total_tokens": 0}
163
+ {"current_steps": 780, "total_steps": 942, "loss": 0.9751, "learning_rate": 4.3789170958493585e-06, "epoch": 1.6551724137931034, "percentage": 82.8, "elapsed_time": "8:25:04", "remaining_time": "1:44:54", "throughput": "0.00", "total_tokens": 0}
164
+ {"current_steps": 785, "total_steps": 942, "loss": 0.9915, "learning_rate": 4.1203566602222745e-06, "epoch": 1.6657824933687002, "percentage": 83.33, "elapsed_time": "8:27:53", "remaining_time": "1:41:34", "throughput": "0.00", "total_tokens": 0}
165
+ {"current_steps": 790, "total_steps": 942, "loss": 0.973, "learning_rate": 3.868977202028581e-06, "epoch": 1.6763925729442972, "percentage": 83.86, "elapsed_time": "8:30:36", "remaining_time": "1:38:14", "throughput": "0.00", "total_tokens": 0}
166
+ {"current_steps": 795, "total_steps": 942, "loss": 0.9931, "learning_rate": 3.6248651762994995e-06, "epoch": 1.6870026525198938, "percentage": 84.39, "elapsed_time": "8:33:24", "remaining_time": "1:34:55", "throughput": "0.00", "total_tokens": 0}
167
+ {"current_steps": 800, "total_steps": 942, "loss": 0.9744, "learning_rate": 3.38810453863328e-06, "epoch": 1.6976127320954908, "percentage": 84.93, "elapsed_time": "8:36:17", "remaining_time": "1:31:38", "throughput": "0.00", "total_tokens": 0}
168
+ {"current_steps": 800, "total_steps": 942, "eval_loss": 0.9801440834999084, "epoch": 1.6976127320954908, "percentage": 84.93, "elapsed_time": "8:44:39", "remaining_time": "1:33:07", "throughput": "0.00", "total_tokens": 0}
169
+ {"current_steps": 805, "total_steps": 942, "loss": 0.9817, "learning_rate": 3.1587767163210157e-06, "epoch": 1.7082228116710876, "percentage": 85.46, "elapsed_time": "8:47:31", "remaining_time": "1:29:46", "throughput": "0.00", "total_tokens": 0}
170
+ {"current_steps": 810, "total_steps": 942, "loss": 0.9809, "learning_rate": 2.9369605803419715e-06, "epoch": 1.7188328912466844, "percentage": 85.99, "elapsed_time": "8:50:19", "remaining_time": "1:26:25", "throughput": "0.00", "total_tokens": 0}
171
+ {"current_steps": 815, "total_steps": 942, "loss": 0.9654, "learning_rate": 2.7227324182380775e-06, "epoch": 1.7294429708222812, "percentage": 86.52, "elapsed_time": "8:53:06", "remaining_time": "1:23:04", "throughput": "0.00", "total_tokens": 0}
172
+ {"current_steps": 820, "total_steps": 942, "loss": 0.9634, "learning_rate": 2.5161659078769466e-06, "epoch": 1.740053050397878, "percentage": 87.05, "elapsed_time": "8:55:53", "remaining_time": "1:19:43", "throughput": "0.00", "total_tokens": 0}
173
+ {"current_steps": 825, "total_steps": 942, "loss": 0.9866, "learning_rate": 2.317332092112384e-06, "epoch": 1.750663129973475, "percentage": 87.58, "elapsed_time": "8:58:41", "remaining_time": "1:16:23", "throughput": "0.00", "total_tokens": 0}
174
+ {"current_steps": 830, "total_steps": 942, "loss": 0.9833, "learning_rate": 2.1262993543511717e-06, "epoch": 1.7612732095490715, "percentage": 88.11, "elapsed_time": "9:01:36", "remaining_time": "1:13:05", "throughput": "0.00", "total_tokens": 0}
175
+ {"current_steps": 835, "total_steps": 942, "loss": 0.9785, "learning_rate": 1.9431333950344855e-06, "epoch": 1.7718832891246685, "percentage": 88.64, "elapsed_time": "9:04:31", "remaining_time": "1:09:46", "throughput": "0.00", "total_tokens": 0}
176
+ {"current_steps": 840, "total_steps": 942, "loss": 0.9752, "learning_rate": 1.767897209042027e-06, "epoch": 1.782493368700265, "percentage": 89.17, "elapsed_time": "9:07:33", "remaining_time": "1:06:29", "throughput": "0.00", "total_tokens": 0}
177
+ {"current_steps": 845, "total_steps": 942, "loss": 0.9846, "learning_rate": 1.6006510640266787e-06, "epoch": 1.793103448275862, "percentage": 89.7, "elapsed_time": "9:10:14", "remaining_time": "1:03:09", "throughput": "0.00", "total_tokens": 0}
178
+ {"current_steps": 850, "total_steps": 942, "loss": 0.9624, "learning_rate": 1.4414524796871027e-06, "epoch": 1.8037135278514589, "percentage": 90.23, "elapsed_time": "9:13:04", "remaining_time": "0:59:51", "throughput": "0.00", "total_tokens": 0}
179
+ {"current_steps": 855, "total_steps": 942, "loss": 0.9817, "learning_rate": 1.2903562079854492e-06, "epoch": 1.8143236074270557, "percentage": 90.76, "elapsed_time": "9:15:55", "remaining_time": "0:56:34", "throughput": "0.00", "total_tokens": 0}
180
+ {"current_steps": 860, "total_steps": 942, "loss": 1.0072, "learning_rate": 1.1474142143168832e-06, "epoch": 1.8249336870026527, "percentage": 91.3, "elapsed_time": "9:18:40", "remaining_time": "0:53:16", "throughput": "0.00", "total_tokens": 0}
181
+ {"current_steps": 865, "total_steps": 942, "loss": 0.9737, "learning_rate": 1.0126756596375686e-06, "epoch": 1.8355437665782492, "percentage": 91.83, "elapsed_time": "9:21:29", "remaining_time": "0:49:58", "throughput": "0.00", "total_tokens": 0}
182
+ {"current_steps": 870, "total_steps": 942, "loss": 0.9675, "learning_rate": 8.86186883557083e-07, "epoch": 1.8461538461538463, "percentage": 92.36, "elapsed_time": "9:24:26", "remaining_time": "0:46:42", "throughput": "0.00", "total_tokens": 0}
183
+ {"current_steps": 875, "total_steps": 942, "loss": 0.9718, "learning_rate": 7.679913884012069e-07, "epoch": 1.8567639257294428, "percentage": 92.89, "elapsed_time": "9:27:11", "remaining_time": "0:43:25", "throughput": "0.00", "total_tokens": 0}
184
+ {"current_steps": 880, "total_steps": 942, "loss": 0.9787, "learning_rate": 6.58129824250478e-07, "epoch": 1.8673740053050398, "percentage": 93.42, "elapsed_time": "9:30:03", "remaining_time": "0:40:09", "throughput": "0.00", "total_tokens": 0}
185
+ {"current_steps": 885, "total_steps": 942, "loss": 0.9511, "learning_rate": 5.566399749597328e-07, "epoch": 1.8779840848806366, "percentage": 93.95, "elapsed_time": "9:32:57", "remaining_time": "0:36:54", "throughput": "0.00", "total_tokens": 0}
186
+ {"current_steps": 890, "total_steps": 942, "loss": 0.9849, "learning_rate": 4.635567451633821e-07, "epoch": 1.8885941644562334, "percentage": 94.48, "elapsed_time": "9:35:50", "remaining_time": "0:33:38", "throughput": "0.00", "total_tokens": 0}
187
+ {"current_steps": 895, "total_steps": 942, "loss": 0.9765, "learning_rate": 3.789121482709407e-07, "epoch": 1.8992042440318302, "percentage": 95.01, "elapsed_time": "9:38:44", "remaining_time": "0:30:23", "throughput": "0.00", "total_tokens": 0}
188
+ {"current_steps": 900, "total_steps": 942, "loss": 0.9887, "learning_rate": 3.027352954568713e-07, "epoch": 1.909814323607427, "percentage": 95.54, "elapsed_time": "9:41:36", "remaining_time": "0:27:08", "throughput": "0.00", "total_tokens": 0}
189
+ {"current_steps": 900, "total_steps": 942, "eval_loss": 0.9800187945365906, "epoch": 1.909814323607427, "percentage": 95.54, "elapsed_time": "9:49:58", "remaining_time": "0:27:31", "throughput": "0.00", "total_tokens": 0}
190
+ {"current_steps": 905, "total_steps": 942, "loss": 0.9813, "learning_rate": 2.350523856486292e-07, "epoch": 1.920424403183024, "percentage": 96.07, "elapsed_time": "9:53:00", "remaining_time": "0:24:14", "throughput": "0.00", "total_tokens": 0}
191
+ {"current_steps": 910, "total_steps": 942, "loss": 0.9907, "learning_rate": 1.7588669651623368e-07, "epoch": 1.9310344827586206, "percentage": 96.6, "elapsed_time": "9:55:57", "remaining_time": "0:20:57", "throughput": "0.00", "total_tokens": 0}
192
+ {"current_steps": 915, "total_steps": 942, "loss": 0.9686, "learning_rate": 1.2525857646658312e-07, "epoch": 1.9416445623342176, "percentage": 97.13, "elapsed_time": "9:58:45", "remaining_time": "0:17:40", "throughput": "0.00", "total_tokens": 0}
193
+ {"current_steps": 920, "total_steps": 942, "loss": 0.9882, "learning_rate": 8.318543764516961e-08, "epoch": 1.9522546419098143, "percentage": 97.66, "elapsed_time": "10:01:35", "remaining_time": "0:14:23", "throughput": "0.00", "total_tokens": 0}
194
+ {"current_steps": 925, "total_steps": 942, "loss": 0.9643, "learning_rate": 4.968174994764152e-08, "epoch": 1.9628647214854111, "percentage": 98.2, "elapsed_time": "10:04:33", "remaining_time": "0:11:06", "throughput": "0.00", "total_tokens": 0}
195
+ {"current_steps": 930, "total_steps": 942, "loss": 0.9801, "learning_rate": 2.4759036043300875e-08, "epoch": 1.973474801061008, "percentage": 98.73, "elapsed_time": "10:07:32", "remaining_time": "0:07:50", "throughput": "0.00", "total_tokens": 0}
196
+ {"current_steps": 935, "total_steps": 942, "loss": 0.9604, "learning_rate": 8.42586741219009e-09, "epoch": 1.9840848806366047, "percentage": 99.26, "elapsed_time": "10:10:19", "remaining_time": "0:04:34", "throughput": "0.00", "total_tokens": 0}
197
+ {"current_steps": 940, "total_steps": 942, "loss": 0.9835, "learning_rate": 6.878613971583736e-10, "epoch": 1.9946949602122017, "percentage": 99.79, "elapsed_time": "10:13:08", "remaining_time": "0:01:18", "throughput": "0.00", "total_tokens": 0}
198
+ {"current_steps": 942, "total_steps": 942, "epoch": 1.9989389920424403, "percentage": 100.0, "elapsed_time": "10:14:26", "remaining_time": "0:00:00", "throughput": "0.00", "total_tokens": 0}
qwen2-7b-instruct-traininfer-instruct0625/ckpt/trainer_state.json ADDED
@@ -0,0 +1,1430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9989389920424403,
5
+ "eval_steps": 100,
6
+ "global_step": 942,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.010610079575596816,
13
+ "grad_norm": 0.07623420818069793,
14
+ "learning_rate": 2.631578947368421e-06,
15
+ "loss": 1.0589,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.021220159151193633,
20
+ "grad_norm": 0.07205664213007502,
21
+ "learning_rate": 5.263157894736842e-06,
22
+ "loss": 1.0432,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.03183023872679045,
27
+ "grad_norm": 0.07626950292061684,
28
+ "learning_rate": 7.894736842105263e-06,
29
+ "loss": 1.0265,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.042440318302387266,
34
+ "grad_norm": 0.08705716193396125,
35
+ "learning_rate": 1.0526315789473684e-05,
36
+ "loss": 1.0666,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.05305039787798409,
41
+ "grad_norm": 0.08703221369797066,
42
+ "learning_rate": 1.3157894736842106e-05,
43
+ "loss": 1.0659,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.0636604774535809,
48
+ "grad_norm": 0.07909046201171775,
49
+ "learning_rate": 1.5789473684210526e-05,
50
+ "loss": 1.0403,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.07427055702917772,
55
+ "grad_norm": 0.06926697000806412,
56
+ "learning_rate": 1.8421052631578947e-05,
57
+ "loss": 1.0297,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.08488063660477453,
62
+ "grad_norm": 0.06368642008991743,
63
+ "learning_rate": 2.105263157894737e-05,
64
+ "loss": 1.0295,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.09549071618037135,
69
+ "grad_norm": 0.07386077960990603,
70
+ "learning_rate": 2.368421052631579e-05,
71
+ "loss": 1.0332,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.10610079575596817,
76
+ "grad_norm": 0.056736689937553715,
77
+ "learning_rate": 2.6315789473684212e-05,
78
+ "loss": 1.0529,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.11671087533156499,
83
+ "grad_norm": 0.054506530378128165,
84
+ "learning_rate": 2.8947368421052634e-05,
85
+ "loss": 1.0286,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.1273209549071618,
90
+ "grad_norm": 0.05335225285700553,
91
+ "learning_rate": 3.157894736842105e-05,
92
+ "loss": 1.0347,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.13793103448275862,
97
+ "grad_norm": 0.04664862379258267,
98
+ "learning_rate": 3.421052631578947e-05,
99
+ "loss": 1.0234,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.14854111405835543,
104
+ "grad_norm": 0.04835549496136054,
105
+ "learning_rate": 3.6842105263157895e-05,
106
+ "loss": 1.0313,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.15915119363395225,
111
+ "grad_norm": 0.04800181980380734,
112
+ "learning_rate": 3.9473684210526316e-05,
113
+ "loss": 1.0114,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.16976127320954906,
118
+ "grad_norm": 0.04677268624263905,
119
+ "learning_rate": 4.210526315789474e-05,
120
+ "loss": 1.0158,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.18037135278514588,
125
+ "grad_norm": 0.04581493024930908,
126
+ "learning_rate": 4.473684210526316e-05,
127
+ "loss": 1.0187,
128
+ "step": 85
129
+ },
130
+ {
131
+ "epoch": 0.1909814323607427,
132
+ "grad_norm": 0.05373519219769595,
133
+ "learning_rate": 4.736842105263158e-05,
134
+ "loss": 1.0084,
135
+ "step": 90
136
+ },
137
+ {
138
+ "epoch": 0.20159151193633953,
139
+ "grad_norm": 0.048548759937219306,
140
+ "learning_rate": 5e-05,
141
+ "loss": 1.0017,
142
+ "step": 95
143
+ },
144
+ {
145
+ "epoch": 0.21220159151193635,
146
+ "grad_norm": 0.04992085587516446,
147
+ "learning_rate": 4.999570096976961e-05,
148
+ "loss": 1.0282,
149
+ "step": 100
150
+ },
151
+ {
152
+ "epoch": 0.21220159151193635,
153
+ "eval_loss": 1.0028369426727295,
154
+ "eval_runtime": 501.3877,
155
+ "eval_samples_per_second": 26.734,
156
+ "eval_steps_per_second": 1.671,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 0.22281167108753316,
161
+ "grad_norm": 0.0508347009907372,
162
+ "learning_rate": 4.998280535761132e-05,
163
+ "loss": 0.9892,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 0.23342175066312998,
168
+ "grad_norm": 0.05151620741113757,
169
+ "learning_rate": 4.996131759861523e-05,
170
+ "loss": 1.0344,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 0.2440318302387268,
175
+ "grad_norm": 0.051059943915805414,
176
+ "learning_rate": 4.99312450829034e-05,
177
+ "loss": 0.9892,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 0.2546419098143236,
182
+ "grad_norm": 0.05925864823530942,
183
+ "learning_rate": 4.989259815308815e-05,
184
+ "loss": 1.0061,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 0.26525198938992045,
189
+ "grad_norm": 0.05985023245860431,
190
+ "learning_rate": 4.984539010071506e-05,
191
+ "loss": 1.0031,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 0.27586206896551724,
196
+ "grad_norm": 0.05755101663405019,
197
+ "learning_rate": 4.978963716169166e-05,
198
+ "loss": 0.9758,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 0.2864721485411141,
203
+ "grad_norm": 0.06563201267943408,
204
+ "learning_rate": 4.972535851070358e-05,
205
+ "loss": 1.0091,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 0.29708222811671087,
210
+ "grad_norm": 0.05863974474653586,
211
+ "learning_rate": 4.965257625461992e-05,
212
+ "loss": 0.9945,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 0.3076923076923077,
217
+ "grad_norm": 0.05932391323295519,
218
+ "learning_rate": 4.957131542489021e-05,
219
+ "loss": 0.9934,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 0.3183023872679045,
224
+ "grad_norm": 0.06076133148832644,
225
+ "learning_rate": 4.948160396893553e-05,
226
+ "loss": 1.0224,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 0.32891246684350134,
231
+ "grad_norm": 0.059180711274686784,
232
+ "learning_rate": 4.9383472740536785e-05,
233
+ "loss": 0.9976,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 0.3395225464190981,
238
+ "grad_norm": 0.06405548051430922,
239
+ "learning_rate": 4.927695548922335e-05,
240
+ "loss": 1.0121,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 0.35013262599469497,
245
+ "grad_norm": 0.07064952148771018,
246
+ "learning_rate": 4.916208884866593e-05,
247
+ "loss": 1.0045,
248
+ "step": 165
249
+ },
250
+ {
251
+ "epoch": 0.36074270557029176,
252
+ "grad_norm": 0.06166758944403884,
253
+ "learning_rate": 4.9038912324077315e-05,
254
+ "loss": 1.0037,
255
+ "step": 170
256
+ },
257
+ {
258
+ "epoch": 0.3713527851458886,
259
+ "grad_norm": 0.06639359858554056,
260
+ "learning_rate": 4.8907468278625747e-05,
261
+ "loss": 1.0125,
262
+ "step": 175
263
+ },
264
+ {
265
+ "epoch": 0.3819628647214854,
266
+ "grad_norm": 0.06726002562984101,
267
+ "learning_rate": 4.876780191886523e-05,
268
+ "loss": 1.0041,
269
+ "step": 180
270
+ },
271
+ {
272
+ "epoch": 0.3925729442970822,
273
+ "grad_norm": 0.06791796540898594,
274
+ "learning_rate": 4.861996127918798e-05,
275
+ "loss": 0.9925,
276
+ "step": 185
277
+ },
278
+ {
279
+ "epoch": 0.40318302387267907,
280
+ "grad_norm": 0.06940095499199088,
281
+ "learning_rate": 4.846399720530434e-05,
282
+ "loss": 0.9977,
283
+ "step": 190
284
+ },
285
+ {
286
+ "epoch": 0.41379310344827586,
287
+ "grad_norm": 0.06478068174424859,
288
+ "learning_rate": 4.8299963336755784e-05,
289
+ "loss": 0.99,
290
+ "step": 195
291
+ },
292
+ {
293
+ "epoch": 0.4244031830238727,
294
+ "grad_norm": 0.06917834842275485,
295
+ "learning_rate": 4.81279160884671e-05,
296
+ "loss": 1.0122,
297
+ "step": 200
298
+ },
299
+ {
300
+ "epoch": 0.4244031830238727,
301
+ "eval_loss": 0.9916980266571045,
302
+ "eval_runtime": 501.6523,
303
+ "eval_samples_per_second": 26.72,
304
+ "eval_steps_per_second": 1.67,
305
+ "step": 200
306
+ },
307
+ {
308
+ "epoch": 0.4350132625994695,
309
+ "grad_norm": 0.06793364380058203,
310
+ "learning_rate": 4.794791463134399e-05,
311
+ "loss": 0.9895,
312
+ "step": 205
313
+ },
314
+ {
315
+ "epoch": 0.44562334217506633,
316
+ "grad_norm": 0.07479898864312906,
317
+ "learning_rate": 4.7760020871922914e-05,
318
+ "loss": 1.0012,
319
+ "step": 210
320
+ },
321
+ {
322
+ "epoch": 0.4562334217506631,
323
+ "grad_norm": 0.07448217111087417,
324
+ "learning_rate": 4.7564299431080016e-05,
325
+ "loss": 0.9883,
326
+ "step": 215
327
+ },
328
+ {
329
+ "epoch": 0.46684350132625996,
330
+ "grad_norm": 0.07102920253452748,
331
+ "learning_rate": 4.736081762180658e-05,
332
+ "loss": 1.0064,
333
+ "step": 220
334
+ },
335
+ {
336
+ "epoch": 0.47745358090185674,
337
+ "grad_norm": 0.0740137877754686,
338
+ "learning_rate": 4.714964542605855e-05,
339
+ "loss": 1.0064,
340
+ "step": 225
341
+ },
342
+ {
343
+ "epoch": 0.4880636604774536,
344
+ "grad_norm": 0.06837487045141567,
345
+ "learning_rate": 4.69308554706882e-05,
346
+ "loss": 0.9942,
347
+ "step": 230
348
+ },
349
+ {
350
+ "epoch": 0.4986737400530504,
351
+ "grad_norm": 0.07204206463584076,
352
+ "learning_rate": 4.67045230024661e-05,
353
+ "loss": 0.9857,
354
+ "step": 235
355
+ },
356
+ {
357
+ "epoch": 0.5092838196286472,
358
+ "grad_norm": 0.07396728806729067,
359
+ "learning_rate": 4.64707258622021e-05,
360
+ "loss": 0.9769,
361
+ "step": 240
362
+ },
363
+ {
364
+ "epoch": 0.519893899204244,
365
+ "grad_norm": 0.07411133999786101,
366
+ "learning_rate": 4.622954445797409e-05,
367
+ "loss": 0.9993,
368
+ "step": 245
369
+ },
370
+ {
371
+ "epoch": 0.5305039787798409,
372
+ "grad_norm": 0.07672751862649076,
373
+ "learning_rate": 4.5981061737473904e-05,
374
+ "loss": 0.9958,
375
+ "step": 250
376
+ },
377
+ {
378
+ "epoch": 0.5411140583554377,
379
+ "grad_norm": 0.07549422167935345,
380
+ "learning_rate": 4.572536315947971e-05,
381
+ "loss": 0.9801,
382
+ "step": 255
383
+ },
384
+ {
385
+ "epoch": 0.5517241379310345,
386
+ "grad_norm": 0.07551777425138942,
387
+ "learning_rate": 4.546253666446484e-05,
388
+ "loss": 0.9937,
389
+ "step": 260
390
+ },
391
+ {
392
+ "epoch": 0.5623342175066313,
393
+ "grad_norm": 0.07657102092212426,
394
+ "learning_rate": 4.519267264435309e-05,
395
+ "loss": 0.9951,
396
+ "step": 265
397
+ },
398
+ {
399
+ "epoch": 0.5729442970822282,
400
+ "grad_norm": 0.07861407498734943,
401
+ "learning_rate": 4.49158639114309e-05,
402
+ "loss": 1.0039,
403
+ "step": 270
404
+ },
405
+ {
406
+ "epoch": 0.583554376657825,
407
+ "grad_norm": 0.07564071240328456,
408
+ "learning_rate": 4.463220566642715e-05,
409
+ "loss": 0.9739,
410
+ "step": 275
411
+ },
412
+ {
413
+ "epoch": 0.5941644562334217,
414
+ "grad_norm": 0.08386678281152504,
415
+ "learning_rate": 4.434179546577146e-05,
416
+ "loss": 0.9864,
417
+ "step": 280
418
+ },
419
+ {
420
+ "epoch": 0.6047745358090185,
421
+ "grad_norm": 0.0802924700957904,
422
+ "learning_rate": 4.4044733188042384e-05,
423
+ "loss": 0.9894,
424
+ "step": 285
425
+ },
426
+ {
427
+ "epoch": 0.6153846153846154,
428
+ "grad_norm": 0.08070465999022375,
429
+ "learning_rate": 4.374112099961689e-05,
430
+ "loss": 0.9876,
431
+ "step": 290
432
+ },
433
+ {
434
+ "epoch": 0.6259946949602122,
435
+ "grad_norm": 0.08270967742116453,
436
+ "learning_rate": 4.34310633195331e-05,
437
+ "loss": 0.9979,
438
+ "step": 295
439
+ },
440
+ {
441
+ "epoch": 0.636604774535809,
442
+ "grad_norm": 0.08009831243849455,
443
+ "learning_rate": 4.3114666783578195e-05,
444
+ "loss": 0.9884,
445
+ "step": 300
446
+ },
447
+ {
448
+ "epoch": 0.636604774535809,
449
+ "eval_loss": 0.9869238138198853,
450
+ "eval_runtime": 501.56,
451
+ "eval_samples_per_second": 26.725,
452
+ "eval_steps_per_second": 1.671,
453
+ "step": 300
454
+ },
455
+ {
456
+ "epoch": 0.6472148541114059,
457
+ "grad_norm": 0.08352239373025634,
458
+ "learning_rate": 4.2792040207614005e-05,
459
+ "loss": 1.0002,
460
+ "step": 305
461
+ },
462
+ {
463
+ "epoch": 0.6578249336870027,
464
+ "grad_norm": 0.07918803174823354,
465
+ "learning_rate": 4.2463294550152786e-05,
466
+ "loss": 0.9769,
467
+ "step": 310
468
+ },
469
+ {
470
+ "epoch": 0.6684350132625995,
471
+ "grad_norm": 0.07978777758565579,
472
+ "learning_rate": 4.212854287419611e-05,
473
+ "loss": 0.9958,
474
+ "step": 315
475
+ },
476
+ {
477
+ "epoch": 0.6790450928381963,
478
+ "grad_norm": 0.07751762504569489,
479
+ "learning_rate": 4.1787900308349924e-05,
480
+ "loss": 0.9857,
481
+ "step": 320
482
+ },
483
+ {
484
+ "epoch": 0.6896551724137931,
485
+ "grad_norm": 0.0799008768730751,
486
+ "learning_rate": 4.1441484007229314e-05,
487
+ "loss": 0.9694,
488
+ "step": 325
489
+ },
490
+ {
491
+ "epoch": 0.7002652519893899,
492
+ "grad_norm": 0.08036989463755223,
493
+ "learning_rate": 4.108941311116634e-05,
494
+ "loss": 0.9931,
495
+ "step": 330
496
+ },
497
+ {
498
+ "epoch": 0.7108753315649867,
499
+ "grad_norm": 0.07942444342254458,
500
+ "learning_rate": 4.073180870523503e-05,
501
+ "loss": 0.9584,
502
+ "step": 335
503
+ },
504
+ {
505
+ "epoch": 0.7214854111405835,
506
+ "grad_norm": 0.08600410405491565,
507
+ "learning_rate": 4.0368793777607524e-05,
508
+ "loss": 0.9904,
509
+ "step": 340
510
+ },
511
+ {
512
+ "epoch": 0.7320954907161804,
513
+ "grad_norm": 0.07890904511326151,
514
+ "learning_rate": 4.000049317725565e-05,
515
+ "loss": 0.9867,
516
+ "step": 345
517
+ },
518
+ {
519
+ "epoch": 0.7427055702917772,
520
+ "grad_norm": 0.08405665680455511,
521
+ "learning_rate": 3.9627033571012586e-05,
522
+ "loss": 0.9905,
523
+ "step": 350
524
+ },
525
+ {
526
+ "epoch": 0.753315649867374,
527
+ "grad_norm": 0.07844813100517245,
528
+ "learning_rate": 3.924854340000931e-05,
529
+ "loss": 0.9901,
530
+ "step": 355
531
+ },
532
+ {
533
+ "epoch": 0.7639257294429708,
534
+ "grad_norm": 0.08284034103558724,
535
+ "learning_rate": 3.886515283550079e-05,
536
+ "loss": 0.9806,
537
+ "step": 360
538
+ },
539
+ {
540
+ "epoch": 0.7745358090185677,
541
+ "grad_norm": 0.08189138625693122,
542
+ "learning_rate": 3.8476993734097155e-05,
543
+ "loss": 0.9849,
544
+ "step": 365
545
+ },
546
+ {
547
+ "epoch": 0.7851458885941645,
548
+ "grad_norm": 0.08045617857202153,
549
+ "learning_rate": 3.8084199592415305e-05,
550
+ "loss": 0.9618,
551
+ "step": 370
552
+ },
553
+ {
554
+ "epoch": 0.7957559681697612,
555
+ "grad_norm": 0.08175433626726658,
556
+ "learning_rate": 3.768690550116639e-05,
557
+ "loss": 0.9827,
558
+ "step": 375
559
+ },
560
+ {
561
+ "epoch": 0.8063660477453581,
562
+ "grad_norm": 0.08203297851648732,
563
+ "learning_rate": 3.728524809869511e-05,
564
+ "loss": 0.9896,
565
+ "step": 380
566
+ },
567
+ {
568
+ "epoch": 0.8169761273209549,
569
+ "grad_norm": 0.08613988767270629,
570
+ "learning_rate": 3.6879365523986706e-05,
571
+ "loss": 1.0072,
572
+ "step": 385
573
+ },
574
+ {
575
+ "epoch": 0.8275862068965517,
576
+ "grad_norm": 0.08663299741096588,
577
+ "learning_rate": 3.646939736915786e-05,
578
+ "loss": 0.9836,
579
+ "step": 390
580
+ },
581
+ {
582
+ "epoch": 0.8381962864721485,
583
+ "grad_norm": 0.08220940782461005,
584
+ "learning_rate": 3.605548463144786e-05,
585
+ "loss": 0.9666,
586
+ "step": 395
587
+ },
588
+ {
589
+ "epoch": 0.8488063660477454,
590
+ "grad_norm": 0.08065282059812733,
591
+ "learning_rate": 3.563776966472649e-05,
592
+ "loss": 0.9771,
593
+ "step": 400
594
+ },
595
+ {
596
+ "epoch": 0.8488063660477454,
597
+ "eval_loss": 0.9840684533119202,
598
+ "eval_runtime": 501.5164,
599
+ "eval_samples_per_second": 26.727,
600
+ "eval_steps_per_second": 1.671,
601
+ "step": 400
602
+ },
603
+ {
604
+ "epoch": 0.8594164456233422,
605
+ "grad_norm": 0.08650228379554649,
606
+ "learning_rate": 3.52163961305353e-05,
607
+ "loss": 0.9998,
608
+ "step": 405
609
+ },
610
+ {
611
+ "epoch": 0.870026525198939,
612
+ "grad_norm": 0.08387979383384966,
613
+ "learning_rate": 3.479150894867926e-05,
614
+ "loss": 0.9685,
615
+ "step": 410
616
+ },
617
+ {
618
+ "epoch": 0.8806366047745358,
619
+ "grad_norm": 0.08721899001858337,
620
+ "learning_rate": 3.436325424738549e-05,
621
+ "loss": 0.9895,
622
+ "step": 415
623
+ },
624
+ {
625
+ "epoch": 0.8912466843501327,
626
+ "grad_norm": 0.08653678608089069,
627
+ "learning_rate": 3.3931779313046574e-05,
628
+ "loss": 0.9861,
629
+ "step": 420
630
+ },
631
+ {
632
+ "epoch": 0.9018567639257294,
633
+ "grad_norm": 0.08927106779743564,
634
+ "learning_rate": 3.349723253956542e-05,
635
+ "loss": 0.9879,
636
+ "step": 425
637
+ },
638
+ {
639
+ "epoch": 0.9124668435013262,
640
+ "grad_norm": 0.09299794688285011,
641
+ "learning_rate": 3.3059763377319294e-05,
642
+ "loss": 0.9686,
643
+ "step": 430
644
+ },
645
+ {
646
+ "epoch": 0.9230769230769231,
647
+ "grad_norm": 0.08250762921336344,
648
+ "learning_rate": 3.261952228176044e-05,
649
+ "loss": 0.9697,
650
+ "step": 435
651
+ },
652
+ {
653
+ "epoch": 0.9336870026525199,
654
+ "grad_norm": 0.08464622856430148,
655
+ "learning_rate": 3.217666066167117e-05,
656
+ "loss": 0.991,
657
+ "step": 440
658
+ },
659
+ {
660
+ "epoch": 0.9442970822281167,
661
+ "grad_norm": 0.08831657667074837,
662
+ "learning_rate": 3.1731330827090865e-05,
663
+ "loss": 0.9796,
664
+ "step": 445
665
+ },
666
+ {
667
+ "epoch": 0.9549071618037135,
668
+ "grad_norm": 0.0875900719250125,
669
+ "learning_rate": 3.128368593693325e-05,
670
+ "loss": 0.9914,
671
+ "step": 450
672
+ },
673
+ {
674
+ "epoch": 0.9655172413793104,
675
+ "grad_norm": 0.09632644759370569,
676
+ "learning_rate": 3.083387994631154e-05,
677
+ "loss": 0.9934,
678
+ "step": 455
679
+ },
680
+ {
681
+ "epoch": 0.9761273209549072,
682
+ "grad_norm": 0.0890756143568433,
683
+ "learning_rate": 3.0382067553589867e-05,
684
+ "loss": 0.9927,
685
+ "step": 460
686
+ },
687
+ {
688
+ "epoch": 0.986737400530504,
689
+ "grad_norm": 0.08262187491301902,
690
+ "learning_rate": 2.992840414717899e-05,
691
+ "loss": 0.9738,
692
+ "step": 465
693
+ },
694
+ {
695
+ "epoch": 0.9973474801061007,
696
+ "grad_norm": 0.08825771969765614,
697
+ "learning_rate": 2.9473045752094818e-05,
698
+ "loss": 0.9789,
699
+ "step": 470
700
+ },
701
+ {
702
+ "epoch": 1.0079575596816976,
703
+ "grad_norm": 0.08895467811064861,
704
+ "learning_rate": 2.9016148976297832e-05,
705
+ "loss": 0.9656,
706
+ "step": 475
707
+ },
708
+ {
709
+ "epoch": 1.0185676392572944,
710
+ "grad_norm": 0.08772274514325573,
711
+ "learning_rate": 2.8557870956832132e-05,
712
+ "loss": 0.9702,
713
+ "step": 480
714
+ },
715
+ {
716
+ "epoch": 1.0291777188328912,
717
+ "grad_norm": 0.08961456548197023,
718
+ "learning_rate": 2.809836930578249e-05,
719
+ "loss": 0.9916,
720
+ "step": 485
721
+ },
722
+ {
723
+ "epoch": 1.039787798408488,
724
+ "grad_norm": 0.09423600327500739,
725
+ "learning_rate": 2.7637802056068018e-05,
726
+ "loss": 0.9879,
727
+ "step": 490
728
+ },
729
+ {
730
+ "epoch": 1.0503978779840848,
731
+ "grad_norm": 0.08849197341901133,
732
+ "learning_rate": 2.7176327607091075e-05,
733
+ "loss": 0.9665,
734
+ "step": 495
735
+ },
736
+ {
737
+ "epoch": 1.0610079575596818,
738
+ "grad_norm": 0.09073880855625993,
739
+ "learning_rate": 2.671410467026021e-05,
740
+ "loss": 0.9974,
741
+ "step": 500
742
+ },
743
+ {
744
+ "epoch": 1.0610079575596818,
745
+ "eval_loss": 0.9823258519172668,
746
+ "eval_runtime": 501.7907,
747
+ "eval_samples_per_second": 26.712,
748
+ "eval_steps_per_second": 1.67,
749
+ "step": 500
750
+ },
751
+ {
752
+ "epoch": 1.0716180371352786,
753
+ "grad_norm": 0.09183816285643769,
754
+ "learning_rate": 2.625129221440569e-05,
755
+ "loss": 0.981,
756
+ "step": 505
757
+ },
758
+ {
759
+ "epoch": 1.0822281167108754,
760
+ "grad_norm": 0.0946815619794241,
761
+ "learning_rate": 2.578804941110664e-05,
762
+ "loss": 1.0164,
763
+ "step": 510
764
+ },
765
+ {
766
+ "epoch": 1.0928381962864722,
767
+ "grad_norm": 0.10079876133979877,
768
+ "learning_rate": 2.5324535579948274e-05,
769
+ "loss": 0.9685,
770
+ "step": 515
771
+ },
772
+ {
773
+ "epoch": 1.103448275862069,
774
+ "grad_norm": 0.09357417526833312,
775
+ "learning_rate": 2.4860910133728388e-05,
776
+ "loss": 0.9723,
777
+ "step": 520
778
+ },
779
+ {
780
+ "epoch": 1.1140583554376657,
781
+ "grad_norm": 0.09538254274812408,
782
+ "learning_rate": 2.4397332523631684e-05,
783
+ "loss": 0.9954,
784
+ "step": 525
785
+ },
786
+ {
787
+ "epoch": 1.1246684350132625,
788
+ "grad_norm": 0.09322492263056234,
789
+ "learning_rate": 2.393396218439097e-05,
790
+ "loss": 0.9805,
791
+ "step": 530
792
+ },
793
+ {
794
+ "epoch": 1.1352785145888595,
795
+ "grad_norm": 0.09381667388866403,
796
+ "learning_rate": 2.3470958479453938e-05,
797
+ "loss": 0.9785,
798
+ "step": 535
799
+ },
800
+ {
801
+ "epoch": 1.1458885941644563,
802
+ "grad_norm": 0.0984535713170549,
803
+ "learning_rate": 2.3008480646174534e-05,
804
+ "loss": 0.9934,
805
+ "step": 540
806
+ },
807
+ {
808
+ "epoch": 1.156498673740053,
809
+ "grad_norm": 0.0976123186479438,
810
+ "learning_rate": 2.2546687741047645e-05,
811
+ "loss": 0.9989,
812
+ "step": 545
813
+ },
814
+ {
815
+ "epoch": 1.16710875331565,
816
+ "grad_norm": 0.09777674870174517,
817
+ "learning_rate": 2.2085738585006024e-05,
818
+ "loss": 0.9759,
819
+ "step": 550
820
+ },
821
+ {
822
+ "epoch": 1.1777188328912467,
823
+ "grad_norm": 0.10035436324164396,
824
+ "learning_rate": 2.1625791708798188e-05,
825
+ "loss": 0.9783,
826
+ "step": 555
827
+ },
828
+ {
829
+ "epoch": 1.1883289124668435,
830
+ "grad_norm": 0.09847514609619938,
831
+ "learning_rate": 2.1167005298466156e-05,
832
+ "loss": 0.9713,
833
+ "step": 560
834
+ },
835
+ {
836
+ "epoch": 1.1989389920424403,
837
+ "grad_norm": 0.09672521232190742,
838
+ "learning_rate": 2.0709537140941705e-05,
839
+ "loss": 0.9654,
840
+ "step": 565
841
+ },
842
+ {
843
+ "epoch": 1.209549071618037,
844
+ "grad_norm": 0.09847144343966878,
845
+ "learning_rate": 2.0253544569779933e-05,
846
+ "loss": 0.9661,
847
+ "step": 570
848
+ },
849
+ {
850
+ "epoch": 1.2201591511936338,
851
+ "grad_norm": 0.0989651039802197,
852
+ "learning_rate": 1.9799184411048695e-05,
853
+ "loss": 0.9759,
854
+ "step": 575
855
+ },
856
+ {
857
+ "epoch": 1.2307692307692308,
858
+ "grad_norm": 0.09888803396418354,
859
+ "learning_rate": 1.9346612929392636e-05,
860
+ "loss": 0.9804,
861
+ "step": 580
862
+ },
863
+ {
864
+ "epoch": 1.2413793103448276,
865
+ "grad_norm": 0.09734456390328389,
866
+ "learning_rate": 1.889598577429022e-05,
867
+ "loss": 0.9807,
868
+ "step": 585
869
+ },
870
+ {
871
+ "epoch": 1.2519893899204244,
872
+ "grad_norm": 0.10124183430533916,
873
+ "learning_rate": 1.8447457926522454e-05,
874
+ "loss": 0.9945,
875
+ "step": 590
876
+ },
877
+ {
878
+ "epoch": 1.2625994694960212,
879
+ "grad_norm": 0.0991604825386959,
880
+ "learning_rate": 1.800118364487146e-05,
881
+ "loss": 0.9769,
882
+ "step": 595
883
+ },
884
+ {
885
+ "epoch": 1.273209549071618,
886
+ "grad_norm": 0.09660029282065204,
887
+ "learning_rate": 1.7557316413067488e-05,
888
+ "loss": 0.9934,
889
+ "step": 600
890
+ },
891
+ {
892
+ "epoch": 1.273209549071618,
893
+ "eval_loss": 0.9812601804733276,
894
+ "eval_runtime": 501.6223,
895
+ "eval_samples_per_second": 26.721,
896
+ "eval_steps_per_second": 1.671,
897
+ "step": 600
898
+ },
899
+ {
900
+ "epoch": 1.2838196286472148,
901
+ "grad_norm": 0.09872097381002237,
902
+ "learning_rate": 1.7116008887002344e-05,
903
+ "loss": 0.9622,
904
+ "step": 605
905
+ },
906
+ {
907
+ "epoch": 1.2944297082228116,
908
+ "grad_norm": 0.09522222343603413,
909
+ "learning_rate": 1.667741284222768e-05,
910
+ "loss": 0.9831,
911
+ "step": 610
912
+ },
913
+ {
914
+ "epoch": 1.3050397877984086,
915
+ "grad_norm": 0.1014944316104097,
916
+ "learning_rate": 1.6241679121755914e-05,
917
+ "loss": 0.9827,
918
+ "step": 615
919
+ },
920
+ {
921
+ "epoch": 1.3156498673740054,
922
+ "grad_norm": 0.10149346811583228,
923
+ "learning_rate": 1.5808957584181998e-05,
924
+ "loss": 0.9754,
925
+ "step": 620
926
+ },
927
+ {
928
+ "epoch": 1.3262599469496021,
929
+ "grad_norm": 0.0958760217857124,
930
+ "learning_rate": 1.537939705214364e-05,
931
+ "loss": 0.9897,
932
+ "step": 625
933
+ },
934
+ {
935
+ "epoch": 1.336870026525199,
936
+ "grad_norm": 0.10206592940194624,
937
+ "learning_rate": 1.4953145261137868e-05,
938
+ "loss": 0.9797,
939
+ "step": 630
940
+ },
941
+ {
942
+ "epoch": 1.3474801061007957,
943
+ "grad_norm": 0.10224543719732158,
944
+ "learning_rate": 1.4530348808711508e-05,
945
+ "loss": 0.9844,
946
+ "step": 635
947
+ },
948
+ {
949
+ "epoch": 1.3580901856763925,
950
+ "grad_norm": 0.0998882895360292,
951
+ "learning_rate": 1.4111153104042993e-05,
952
+ "loss": 0.9955,
953
+ "step": 640
954
+ },
955
+ {
956
+ "epoch": 1.3687002652519893,
957
+ "grad_norm": 0.09440011868080461,
958
+ "learning_rate": 1.3695702317932862e-05,
959
+ "loss": 0.998,
960
+ "step": 645
961
+ },
962
+ {
963
+ "epoch": 1.3793103448275863,
964
+ "grad_norm": 0.10070400345672928,
965
+ "learning_rate": 1.3284139333220207e-05,
966
+ "loss": 0.9832,
967
+ "step": 650
968
+ },
969
+ {
970
+ "epoch": 1.389920424403183,
971
+ "grad_norm": 0.10284785595425723,
972
+ "learning_rate": 1.2876605695642086e-05,
973
+ "loss": 0.9818,
974
+ "step": 655
975
+ },
976
+ {
977
+ "epoch": 1.4005305039787799,
978
+ "grad_norm": 0.09907296614307597,
979
+ "learning_rate": 1.247324156515271e-05,
980
+ "loss": 0.976,
981
+ "step": 660
982
+ },
983
+ {
984
+ "epoch": 1.4111405835543767,
985
+ "grad_norm": 0.09857949146993086,
986
+ "learning_rate": 1.2074185667719353e-05,
987
+ "loss": 0.9489,
988
+ "step": 665
989
+ },
990
+ {
991
+ "epoch": 1.4217506631299734,
992
+ "grad_norm": 0.1027503153542044,
993
+ "learning_rate": 1.1679575247611341e-05,
994
+ "loss": 0.9765,
995
+ "step": 670
996
+ },
997
+ {
998
+ "epoch": 1.4323607427055702,
999
+ "grad_norm": 0.09647369113256919,
1000
+ "learning_rate": 1.1289546020198719e-05,
1001
+ "loss": 0.9934,
1002
+ "step": 675
1003
+ },
1004
+ {
1005
+ "epoch": 1.442970822281167,
1006
+ "grad_norm": 0.09839399148271344,
1007
+ "learning_rate": 1.0904232125276609e-05,
1008
+ "loss": 0.9728,
1009
+ "step": 680
1010
+ },
1011
+ {
1012
+ "epoch": 1.453580901856764,
1013
+ "grad_norm": 0.10156504921281477,
1014
+ "learning_rate": 1.052376608093162e-05,
1015
+ "loss": 0.9811,
1016
+ "step": 685
1017
+ },
1018
+ {
1019
+ "epoch": 1.4641909814323608,
1020
+ "grad_norm": 0.10257219505892362,
1021
+ "learning_rate": 1.0148278737965845e-05,
1022
+ "loss": 0.9653,
1023
+ "step": 690
1024
+ },
1025
+ {
1026
+ "epoch": 1.4748010610079576,
1027
+ "grad_norm": 0.09960716843498467,
1028
+ "learning_rate": 9.777899234894387e-06,
1029
+ "loss": 0.9735,
1030
+ "step": 695
1031
+ },
1032
+ {
1033
+ "epoch": 1.4854111405835544,
1034
+ "grad_norm": 0.09941623700354961,
1035
+ "learning_rate": 9.412754953531663e-06,
1036
+ "loss": 0.9738,
1037
+ "step": 700
1038
+ },
1039
+ {
1040
+ "epoch": 1.4854111405835544,
1041
+ "eval_loss": 0.9805117249488831,
1042
+ "eval_runtime": 501.4908,
1043
+ "eval_samples_per_second": 26.728,
1044
+ "eval_steps_per_second": 1.671,
1045
+ "step": 700
1046
+ },
1047
+ {
1048
+ "epoch": 1.4960212201591512,
1049
+ "grad_norm": 0.09699569939298165,
1050
+ "learning_rate": 9.052971475182004e-06,
1051
+ "loss": 0.976,
1052
+ "step": 705
1053
+ },
1054
+ {
1055
+ "epoch": 1.506631299734748,
1056
+ "grad_norm": 0.1022949810528641,
1057
+ "learning_rate": 8.698672537449385e-06,
1058
+ "loss": 0.9678,
1059
+ "step": 710
1060
+ },
1061
+ {
1062
+ "epoch": 1.5172413793103448,
1063
+ "grad_norm": 0.09925245872600137,
1064
+ "learning_rate": 8.349979991681333e-06,
1065
+ "loss": 0.9707,
1066
+ "step": 715
1067
+ },
1068
+ {
1069
+ "epoch": 1.5278514588859418,
1070
+ "grad_norm": 0.0978077000211089,
1071
+ "learning_rate": 8.00701376106148e-06,
1072
+ "loss": 1.0067,
1073
+ "step": 720
1074
+ },
1075
+ {
1076
+ "epoch": 1.5384615384615383,
1077
+ "grad_norm": 0.09702110693617705,
1078
+ "learning_rate": 7.669891799365283e-06,
1079
+ "loss": 0.9753,
1080
+ "step": 725
1081
+ },
1082
+ {
1083
+ "epoch": 1.5490716180371353,
1084
+ "grad_norm": 0.10106616057187415,
1085
+ "learning_rate": 7.338730050393114e-06,
1086
+ "loss": 0.9703,
1087
+ "step": 730
1088
+ },
1089
+ {
1090
+ "epoch": 1.5596816976127321,
1091
+ "grad_norm": 0.10139301720670488,
1092
+ "learning_rate": 7.01364240809459e-06,
1093
+ "loss": 0.9804,
1094
+ "step": 735
1095
+ },
1096
+ {
1097
+ "epoch": 1.570291777188329,
1098
+ "grad_norm": 0.10543185047759056,
1099
+ "learning_rate": 6.694740677397845e-06,
1100
+ "loss": 0.9809,
1101
+ "step": 740
1102
+ },
1103
+ {
1104
+ "epoch": 1.5809018567639257,
1105
+ "grad_norm": 0.09810200524296686,
1106
+ "learning_rate": 6.382134535757339e-06,
1107
+ "loss": 0.9675,
1108
+ "step": 745
1109
+ },
1110
+ {
1111
+ "epoch": 1.5915119363395225,
1112
+ "grad_norm": 0.10063678881940345,
1113
+ "learning_rate": 6.075931495433315e-06,
1114
+ "loss": 0.9792,
1115
+ "step": 750
1116
+ },
1117
+ {
1118
+ "epoch": 1.6021220159151195,
1119
+ "grad_norm": 0.10751838250373763,
1120
+ "learning_rate": 5.776236866515947e-06,
1121
+ "loss": 0.9759,
1122
+ "step": 755
1123
+ },
1124
+ {
1125
+ "epoch": 1.612732095490716,
1126
+ "grad_norm": 0.10251266957258413,
1127
+ "learning_rate": 5.483153720706799e-06,
1128
+ "loss": 0.9756,
1129
+ "step": 760
1130
+ },
1131
+ {
1132
+ "epoch": 1.623342175066313,
1133
+ "grad_norm": 0.10576679802248473,
1134
+ "learning_rate": 5.19678285587018e-06,
1135
+ "loss": 0.9719,
1136
+ "step": 765
1137
+ },
1138
+ {
1139
+ "epoch": 1.6339522546419099,
1140
+ "grad_norm": 0.0979386252610659,
1141
+ "learning_rate": 4.917222761366477e-06,
1142
+ "loss": 1.0034,
1143
+ "step": 770
1144
+ },
1145
+ {
1146
+ "epoch": 1.6445623342175066,
1147
+ "grad_norm": 0.09861558820444138,
1148
+ "learning_rate": 4.644569584179509e-06,
1149
+ "loss": 0.9789,
1150
+ "step": 775
1151
+ },
1152
+ {
1153
+ "epoch": 1.6551724137931034,
1154
+ "grad_norm": 0.10337492990116616,
1155
+ "learning_rate": 4.3789170958493585e-06,
1156
+ "loss": 0.9751,
1157
+ "step": 780
1158
+ },
1159
+ {
1160
+ "epoch": 1.6657824933687002,
1161
+ "grad_norm": 0.0973808745211656,
1162
+ "learning_rate": 4.1203566602222745e-06,
1163
+ "loss": 0.9915,
1164
+ "step": 785
1165
+ },
1166
+ {
1167
+ "epoch": 1.6763925729442972,
1168
+ "grad_norm": 0.09761203241646615,
1169
+ "learning_rate": 3.868977202028581e-06,
1170
+ "loss": 0.973,
1171
+ "step": 790
1172
+ },
1173
+ {
1174
+ "epoch": 1.6870026525198938,
1175
+ "grad_norm": 0.10171457056462775,
1176
+ "learning_rate": 3.6248651762994995e-06,
1177
+ "loss": 0.9931,
1178
+ "step": 795
1179
+ },
1180
+ {
1181
+ "epoch": 1.6976127320954908,
1182
+ "grad_norm": 0.10461287792457526,
1183
+ "learning_rate": 3.38810453863328e-06,
1184
+ "loss": 0.9744,
1185
+ "step": 800
1186
+ },
1187
+ {
1188
+ "epoch": 1.6976127320954908,
1189
+ "eval_loss": 0.9801440834999084,
1190
+ "eval_runtime": 501.708,
1191
+ "eval_samples_per_second": 26.717,
1192
+ "eval_steps_per_second": 1.67,
1193
+ "step": 800
1194
+ },
1195
+ {
1196
+ "epoch": 1.7082228116710876,
1197
+ "grad_norm": 0.10531636550074536,
1198
+ "learning_rate": 3.1587767163210157e-06,
1199
+ "loss": 0.9817,
1200
+ "step": 805
1201
+ },
1202
+ {
1203
+ "epoch": 1.7188328912466844,
1204
+ "grad_norm": 0.10694585581807346,
1205
+ "learning_rate": 2.9369605803419715e-06,
1206
+ "loss": 0.9809,
1207
+ "step": 810
1208
+ },
1209
+ {
1210
+ "epoch": 1.7294429708222812,
1211
+ "grad_norm": 0.10505045842111997,
1212
+ "learning_rate": 2.7227324182380775e-06,
1213
+ "loss": 0.9654,
1214
+ "step": 815
1215
+ },
1216
+ {
1217
+ "epoch": 1.740053050397878,
1218
+ "grad_norm": 0.10697090247086784,
1219
+ "learning_rate": 2.5161659078769466e-06,
1220
+ "loss": 0.9634,
1221
+ "step": 820
1222
+ },
1223
+ {
1224
+ "epoch": 1.750663129973475,
1225
+ "grad_norm": 0.10337571820075339,
1226
+ "learning_rate": 2.317332092112384e-06,
1227
+ "loss": 0.9866,
1228
+ "step": 825
1229
+ },
1230
+ {
1231
+ "epoch": 1.7612732095490715,
1232
+ "grad_norm": 0.09992375037287819,
1233
+ "learning_rate": 2.1262993543511717e-06,
1234
+ "loss": 0.9833,
1235
+ "step": 830
1236
+ },
1237
+ {
1238
+ "epoch": 1.7718832891246685,
1239
+ "grad_norm": 0.1002631099347442,
1240
+ "learning_rate": 1.9431333950344855e-06,
1241
+ "loss": 0.9785,
1242
+ "step": 835
1243
+ },
1244
+ {
1245
+ "epoch": 1.782493368700265,
1246
+ "grad_norm": 0.10479627463164663,
1247
+ "learning_rate": 1.767897209042027e-06,
1248
+ "loss": 0.9752,
1249
+ "step": 840
1250
+ },
1251
+ {
1252
+ "epoch": 1.793103448275862,
1253
+ "grad_norm": 0.10197107953524158,
1254
+ "learning_rate": 1.6006510640266787e-06,
1255
+ "loss": 0.9846,
1256
+ "step": 845
1257
+ },
1258
+ {
1259
+ "epoch": 1.8037135278514589,
1260
+ "grad_norm": 0.10167121112531119,
1261
+ "learning_rate": 1.4414524796871027e-06,
1262
+ "loss": 0.9624,
1263
+ "step": 850
1264
+ },
1265
+ {
1266
+ "epoch": 1.8143236074270557,
1267
+ "grad_norm": 0.09922610672387365,
1268
+ "learning_rate": 1.2903562079854492e-06,
1269
+ "loss": 0.9817,
1270
+ "step": 855
1271
+ },
1272
+ {
1273
+ "epoch": 1.8249336870026527,
1274
+ "grad_norm": 0.1048631234042592,
1275
+ "learning_rate": 1.1474142143168832e-06,
1276
+ "loss": 1.0072,
1277
+ "step": 860
1278
+ },
1279
+ {
1280
+ "epoch": 1.8355437665782492,
1281
+ "grad_norm": 0.09750361912686112,
1282
+ "learning_rate": 1.0126756596375686e-06,
1283
+ "loss": 0.9737,
1284
+ "step": 865
1285
+ },
1286
+ {
1287
+ "epoch": 1.8461538461538463,
1288
+ "grad_norm": 0.10178011267304485,
1289
+ "learning_rate": 8.86186883557083e-07,
1290
+ "loss": 0.9675,
1291
+ "step": 870
1292
+ },
1293
+ {
1294
+ "epoch": 1.8567639257294428,
1295
+ "grad_norm": 0.10344721675019256,
1296
+ "learning_rate": 7.679913884012069e-07,
1297
+ "loss": 0.9718,
1298
+ "step": 875
1299
+ },
1300
+ {
1301
+ "epoch": 1.8673740053050398,
1302
+ "grad_norm": 0.09806593314586452,
1303
+ "learning_rate": 6.58129824250478e-07,
1304
+ "loss": 0.9787,
1305
+ "step": 880
1306
+ },
1307
+ {
1308
+ "epoch": 1.8779840848806366,
1309
+ "grad_norm": 0.10272666481587975,
1310
+ "learning_rate": 5.566399749597328e-07,
1311
+ "loss": 0.9511,
1312
+ "step": 885
1313
+ },
1314
+ {
1315
+ "epoch": 1.8885941644562334,
1316
+ "grad_norm": 0.10081265185788726,
1317
+ "learning_rate": 4.635567451633821e-07,
1318
+ "loss": 0.9849,
1319
+ "step": 890
1320
+ },
1321
+ {
1322
+ "epoch": 1.8992042440318302,
1323
+ "grad_norm": 0.10227171696148227,
1324
+ "learning_rate": 3.789121482709407e-07,
1325
+ "loss": 0.9765,
1326
+ "step": 895
1327
+ },
1328
+ {
1329
+ "epoch": 1.909814323607427,
1330
+ "grad_norm": 0.10128987382975041,
1331
+ "learning_rate": 3.027352954568713e-07,
1332
+ "loss": 0.9887,
1333
+ "step": 900
1334
+ },
1335
+ {
1336
+ "epoch": 1.909814323607427,
1337
+ "eval_loss": 0.9800187945365906,
1338
+ "eval_runtime": 501.7687,
1339
+ "eval_samples_per_second": 26.714,
1340
+ "eval_steps_per_second": 1.67,
1341
+ "step": 900
1342
+ },
1343
+ {
1344
+ "epoch": 1.920424403183024,
1345
+ "grad_norm": 0.10888174926639403,
1346
+ "learning_rate": 2.350523856486292e-07,
1347
+ "loss": 0.9813,
1348
+ "step": 905
1349
+ },
1350
+ {
1351
+ "epoch": 1.9310344827586206,
1352
+ "grad_norm": 0.09593621401437472,
1353
+ "learning_rate": 1.7588669651623368e-07,
1354
+ "loss": 0.9907,
1355
+ "step": 910
1356
+ },
1357
+ {
1358
+ "epoch": 1.9416445623342176,
1359
+ "grad_norm": 0.09834431957222971,
1360
+ "learning_rate": 1.2525857646658312e-07,
1361
+ "loss": 0.9686,
1362
+ "step": 915
1363
+ },
1364
+ {
1365
+ "epoch": 1.9522546419098143,
1366
+ "grad_norm": 0.09987466210480013,
1367
+ "learning_rate": 8.318543764516961e-08,
1368
+ "loss": 0.9882,
1369
+ "step": 920
1370
+ },
1371
+ {
1372
+ "epoch": 1.9628647214854111,
1373
+ "grad_norm": 0.10365551816637686,
1374
+ "learning_rate": 4.968174994764152e-08,
1375
+ "loss": 0.9643,
1376
+ "step": 925
1377
+ },
1378
+ {
1379
+ "epoch": 1.973474801061008,
1380
+ "grad_norm": 0.09717537741476147,
1381
+ "learning_rate": 2.4759036043300875e-08,
1382
+ "loss": 0.9801,
1383
+ "step": 930
1384
+ },
1385
+ {
1386
+ "epoch": 1.9840848806366047,
1387
+ "grad_norm": 0.09721331281181161,
1388
+ "learning_rate": 8.42586741219009e-09,
1389
+ "loss": 0.9604,
1390
+ "step": 935
1391
+ },
1392
+ {
1393
+ "epoch": 1.9946949602122017,
1394
+ "grad_norm": 0.10113910306283141,
1395
+ "learning_rate": 6.878613971583736e-10,
1396
+ "loss": 0.9835,
1397
+ "step": 940
1398
+ },
1399
+ {
1400
+ "epoch": 1.9989389920424403,
1401
+ "step": 942,
1402
+ "total_flos": 1.4259794694102843e+19,
1403
+ "train_loss": 0.9894429036513003,
1404
+ "train_runtime": 36869.1128,
1405
+ "train_samples_per_second": 6.544,
1406
+ "train_steps_per_second": 0.026
1407
+ }
1408
+ ],
1409
+ "logging_steps": 5,
1410
+ "max_steps": 942,
1411
+ "num_input_tokens_seen": 0,
1412
+ "num_train_epochs": 2,
1413
+ "save_steps": 300,
1414
+ "stateful_callbacks": {
1415
+ "TrainerControl": {
1416
+ "args": {
1417
+ "should_epoch_stop": false,
1418
+ "should_evaluate": false,
1419
+ "should_log": false,
1420
+ "should_save": true,
1421
+ "should_training_stop": true
1422
+ },
1423
+ "attributes": {}
1424
+ }
1425
+ },
1426
+ "total_flos": 1.4259794694102843e+19,
1427
+ "train_batch_size": 8,
1428
+ "trial_name": null,
1429
+ "trial_params": null
1430
+ }
qwen2-7b-instruct-traininfer-instruct0625/ckpt/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a54c455d930e64e0c7749ab36bc828670885258aff50943d4502631454d52843
3
+ size 6648
qwen2-7b-instruct-traininfer-instruct0625/ckpt/training_eval_loss.png ADDED
qwen2-7b-instruct-traininfer-instruct0625/ckpt/training_loss.png ADDED
qwen2-7b-instruct-traininfer-instruct0625/ckpt/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qwen2-7b-instruct-traininfer-instruct0625/merge_weight.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base model on the Hugging Face Hub, the local LoRA adapter directory, and
# the directory that will receive the merged, standalone checkpoint.
model_id = "Qwen/Qwen2-7B-Instruct"
adapter_path = "./ckpt"
output_path = "Qwen2-7B-Instruct-Merged"

# Tokenizer comes from the base model; it is saved alongside the merged
# weights below so output_path is loadable on its own (the original script
# loaded it but never saved it, leaving the merged dir without tokenizer files).
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
).eval()

# Attach the LoRA adapter, then fold its deltas into the base weights so the
# result is a plain (non-PEFT) model.
model = PeftModel.from_pretrained(model, model_id=adapter_path)
merged_model = model.merge_and_unload()

# Persist the merged weights as sharded safetensors plus the tokenizer.
merged_model.save_pretrained(
    output_path, max_shard_size="2048MB", safe_serialization=True
)
tokenizer.save_pretrained(output_path)
qwen2-7b-instruct-traininfer-instruct0625/qwen2_7b_instruct_lora_sft.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # config for https://github.com/hiyouga/LLaMA-Factory
2
+ ### model
3
+ model_name_or_path: Qwen/Qwen2-7B-Instruct
4
+ ### method
5
+ stage: sft
6
+ do_train: true
7
+ finetuning_type: lora
8
+ lora_target: all
9
+
10
+ ### dataset
11
+ dataset_dir: data
12
+ dataset: llm-complex-reasoning-train-qwen2-72b-instruct-correct,Infinity-Instruct-0625
13
+ template: qwen
14
+ cutoff_len: 2048
15
+ max_samples: 128000
16
+ overwrite_cache: false
17
+ preprocessing_num_workers: 8
18
+ # deepspeed: ./LLaMA-Factory/examples/deepspeed/ds_z2_config.json
19
+
20
+ ### output
21
+ output_dir: output/qwen2-7b-instruct/sft-lora
22
+ logging_steps: 5
23
+ save_steps: 300
24
+ plot_loss: true
25
+ overwrite_output_dir: true
26
+
27
+ ### train
28
+ per_device_train_batch_size: 8
29
+ gradient_accumulation_steps: 8
30
+ learning_rate: 5.0e-5
31
+ num_train_epochs: 2.0
32
+ lr_scheduler_type: cosine
33
+ warmup_ratio: 0.1
34
+ bf16: true
35
+ ddp_timeout: 180000000
36
+
37
+ ### eval
38
+ val_size: 0.1
39
+ per_device_eval_batch_size: 4
40
+ eval_strategy: steps
41
+ eval_steps: 100
qwen2-7b-instruct-traininfer-instruct0625/readme.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Finetune with `LLaMA-Factory`
2
+
3
+ `dataset_info.json` add
4
+
5
+ ```json
6
+ {
7
+ "llm-complex-reasoning-train-qwen2-72b-instruct-correct": {
8
+ "file_name": "/data/songhaoyang/llm-complex-reasoning/data/llm-complex-reasoning-train-qwen2-72b-instruct-correct/train.jsonl",
9
+ "formatting": "sharegpt",
10
+ "columns": {
11
+ "messages": "messages"
12
+ },
13
+ "tags": {
14
+ "role_tag": "role",
15
+ "content_tag": "content",
16
+ "user_tag": "user",
17
+ "assistant_tag": "assistant",
18
+ "system_tag": "system"
19
+ }
20
+ },
21
+ "Infinity-Instruct-0625": {
22
+ "hf_hub_url": "BAAI/Infinity-Instruct",
23
+ "subset": "0625",
24
+ "formatting": "sharegpt"
25
+ }
26
+ }
27
+ ```
28
+
29
+ Usage: `llamafactory-cli train qwen2_7b_instruct_lora_sft.yaml`