loubnabnl HF staff committed on
Commit
1717970
0 Parent(s):

Duplicate from HuggingFaceTB/smollm2-360M-8k-lc100k-dpo-ultaf-ep2

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: loubnabnl/smollm2-360M-8k-lc100k-mix1-ep2
3
+ tags:
4
+ - alignment-handbook
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ - trl
9
+ - dpo
10
+ - generated_from_trainer
11
+ datasets:
12
+ - HuggingFaceH4/ultrafeedback_binarized
13
+ model-index:
14
+ - name: smollm2-360M-8k-lc100k-dpo-ultaf-ep2
15
+ results: []
16
+ ---
17
+
18
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
19
+ should probably proofread and complete it, then remove this comment. -->
20
+
21
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/loubnabnl/huggingface/runs/3nedrmyg)
22
+ # smollm2-360M-8k-lc100k-dpo-ultaf-ep2
23
+
24
+ This model is a fine-tuned version of [loubnabnl/smollm2-360M-8k-lc100k-mix1-ep2](https://huggingface.co/loubnabnl/smollm2-360M-8k-lc100k-mix1-ep2) on the HuggingFaceH4/ultrafeedback_binarized dataset.
25
+ It achieves the following results on the evaluation set:
26
+ - Loss: 0.6348
27
+ - Rewards/chosen: -0.0342
28
+ - Rewards/rejected: -0.3910
29
+ - Rewards/accuracies: 0.6190
30
+ - Rewards/margins: 0.3568
31
+ - Logps/rejected: -323.7198
32
+ - Logps/chosen: -375.6464
33
+ - Logits/rejected: -1.6969
34
+ - Logits/chosen: -1.6408
35
+
36
+ ## Model description
37
+
38
+ More information needed
39
+
40
+ ## Intended uses & limitations
41
+
42
+ More information needed
43
+
44
+ ## Training and evaluation data
45
+
46
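+ The model was trained and evaluated on the [HuggingFaceH4/ultrafeedback_binarized](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized) dataset (61,134 training samples and 2,000 evaluation samples).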
47
+
48
+ ## Training procedure
49
+
50
+ ### Training hyperparameters
51
+
52
+ The following hyperparameters were used during training:
53
+ - learning_rate: 1e-06
54
+ - train_batch_size: 2
55
+ - eval_batch_size: 4
56
+ - seed: 42
57
+ - distributed_type: multi-GPU
58
+ - num_devices: 8
59
+ - gradient_accumulation_steps: 8
60
+ - total_train_batch_size: 128
61
+ - total_eval_batch_size: 32
62
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
63
+ - lr_scheduler_type: cosine
64
+ - lr_scheduler_warmup_ratio: 0.1
65
+ - num_epochs: 2
66
+
67
+ ### Training results
68
+
69
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
70
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
71
+ | 0.7098 | 0.2094 | 100 | 0.7162 | -0.0109 | -0.0675 | 0.5278 | 0.0566 | -323.0727 | -375.5997 | -1.6983 | -1.6387 |
72
+ | 0.6825 | 0.4187 | 200 | 0.6842 | -0.0010 | -0.1880 | 0.5794 | 0.1870 | -323.3139 | -375.5800 | -1.6938 | -1.6358 |
73
+ | 0.6630 | 0.6281 | 300 | 0.6617 | 0.0225 | -0.2389 | 0.6032 | 0.2614 | -323.4156 | -375.5330 | -1.6893 | -1.6317 |
74
+ | 0.6547 | 0.8375 | 400 | 0.6591 | 0.0001 | -0.3516 | 0.6389 | 0.3517 | -323.6410 | -375.5778 | -1.6980 | -1.6414 |
75
+ | 0.6456 | 1.0468 | 500 | 0.6430 | 0.0133 | -0.3566 | 0.6667 | 0.3699 | -323.6510 | -375.5514 | -1.6931 | -1.6365 |
76
+ | 0.6054 | 1.2562 | 600 | 0.6423 | -0.0329 | -0.3895 | 0.6349 | 0.3566 | -323.7167 | -375.6438 | -1.6991 | -1.6431 |
77
+ | 0.6129 | 1.4656 | 700 | 0.6431 | -0.0449 | -0.4183 | 0.6349 | 0.3735 | -323.7745 | -375.6677 | -1.6979 | -1.6414 |
78
+ | 0.5972 | 1.6750 | 800 | 0.6384 | -0.0695 | -0.4139 | 0.6429 | 0.3444 | -323.7656 | -375.7169 | -1.6965 | -1.6399 |
79
+ | 0.6207 | 1.8843 | 900 | 0.6362 | -0.0627 | -0.4222 | 0.6786 | 0.3595 | -323.7822 | -375.7033 | -1.6976 | -1.6407 |
80
+
81
+
82
+ ### Framework versions
83
+
84
+ - Transformers 4.42.3
85
+ - Pytorch 2.1.2
86
+ - Datasets 2.20.0
87
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.9973828840617638,
3
+ "eval_logits/chosen": -1.6407532691955566,
4
+ "eval_logits/rejected": -1.6968854665756226,
5
+ "eval_logps/chosen": -375.6463623046875,
6
+ "eval_logps/rejected": -323.7197570800781,
7
+ "eval_loss": 0.6348475217819214,
8
+ "eval_rewards/accuracies": 0.6190476417541504,
9
+ "eval_rewards/chosen": -0.034213583916425705,
10
+ "eval_rewards/margins": 0.3567626178264618,
11
+ "eval_rewards/rejected": -0.3909761905670166,
12
+ "eval_runtime": 22.3598,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 89.446,
15
+ "eval_steps_per_second": 2.818,
16
+ "total_flos": 0.0,
17
+ "train_loss": 0.6516540072998911,
18
+ "train_runtime": 5944.7081,
19
+ "train_samples": 61134,
20
+ "train_samples_per_second": 20.568,
21
+ "train_steps_per_second": 0.16
22
+ }
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "loubnabnl/smollm2-360M-8k-lc100k-mix1-ep2",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 960,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 2560,
14
+ "is_llama_config": true,
15
+ "max_position_embeddings": 8192,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 15,
19
+ "num_hidden_layers": 32,
20
+ "num_key_value_heads": 5,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_interleaved": false,
25
+ "rope_scaling": null,
26
+ "rope_theta": 100000,
27
+ "tie_word_embeddings": true,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.42.3",
30
+ "use_cache": true,
31
+ "vocab_size": 49152
32
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.9973828840617638,
3
+ "eval_logits/chosen": -1.6407532691955566,
4
+ "eval_logits/rejected": -1.6968854665756226,
5
+ "eval_logps/chosen": -375.6463623046875,
6
+ "eval_logps/rejected": -323.7197570800781,
7
+ "eval_loss": 0.6348475217819214,
8
+ "eval_rewards/accuracies": 0.6190476417541504,
9
+ "eval_rewards/chosen": -0.034213583916425705,
10
+ "eval_rewards/margins": 0.3567626178264618,
11
+ "eval_rewards/rejected": -0.3909761905670166,
12
+ "eval_runtime": 22.3598,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 89.446,
15
+ "eval_steps_per_second": 2.818
16
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.42.3"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6bffe7435d7ddc10fd3b9a9efd429dafbacb1cb17015fb5562664e7532bf86e
3
+ size 723674912
runs/Oct31_09-01-58_ip-26-0-172-142/events.out.tfevents.1730365788.ip-26-0-172-142.451351.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a59625dee7fb61068e6a6ea53682ee0d91da3f92189019d5ceed597d21ebf27
3
+ size 78139
runs/Oct31_09-01-58_ip-26-0-172-142/events.out.tfevents.1730371773.ip-26-0-172-142.451351.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac5691a0cdfdd383d29b69e85c4aaf632c695ce8fd31683bb9512f310fbf1250
3
+ size 828
runs/Oct31_09-19-57_ip-26-0-174-36/events.out.tfevents.1730366818.ip-26-0-174-36.3233632.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44f4f653d8d22e5db793f6743dc486f1d3f0919a66d5bbd69ebfd22b9e1f598b
3
+ size 38996
runs/Oct31_09-20-43_ip-26-0-161-142/events.out.tfevents.1730366856.ip-26-0-161-142.1301887.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684365d127b952fdfd215e337f4f8dc5bc4d37ef88fb608632d244ec6f86a449
3
+ size 37621
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<|im_start|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|im_end|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "unk_token": {
28
+ "content": "<|endoftext|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ }
140
+ },
141
+ "additional_special_tokens": [
142
+ "<|im_start|>",
143
+ "<|im_end|>"
144
+ ],
145
+ "bos_token": "<|im_start|>",
146
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
147
+ "clean_up_tokenization_spaces": false,
148
+ "eos_token": "<|im_end|>",
149
+ "model_max_length": 2048,
150
+ "pad_token": "<|im_end|>",
151
+ "tokenizer_class": "GPT2Tokenizer",
152
+ "unk_token": "<|endoftext|>",
153
+ "vocab_size": 49152
154
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.9973828840617638,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.6516540072998911,
5
+ "train_runtime": 5944.7081,
6
+ "train_samples": 61134,
7
+ "train_samples_per_second": 20.568,
8
+ "train_steps_per_second": 0.16
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9973828840617638,
5
+ "eval_steps": 100,
6
+ "global_step": 954,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002093692750588851,
13
+ "grad_norm": 37.29612828590162,
14
+ "learning_rate": 1.0416666666666666e-08,
15
+ "logits/chosen": -1.5181711912155151,
16
+ "logits/rejected": -1.4237494468688965,
17
+ "logps/chosen": -331.7369384765625,
18
+ "logps/rejected": -353.880859375,
19
+ "loss": 0.6931,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/rejected": 0.0,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.02093692750588851,
28
+ "grad_norm": 34.09183299219303,
29
+ "learning_rate": 1.0416666666666667e-07,
30
+ "logits/chosen": -1.4457752704620361,
31
+ "logits/rejected": -1.4434000253677368,
32
+ "logps/chosen": -409.06182861328125,
33
+ "logps/rejected": -344.4476318359375,
34
+ "loss": 0.7192,
35
+ "rewards/accuracies": 0.3611111044883728,
36
+ "rewards/chosen": -0.012955614365637302,
37
+ "rewards/margins": 0.009454314596951008,
38
+ "rewards/rejected": -0.02240992709994316,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.04187385501177702,
43
+ "grad_norm": 33.674655892230355,
44
+ "learning_rate": 2.0833333333333333e-07,
45
+ "logits/chosen": -1.2904545068740845,
46
+ "logits/rejected": -1.4261707067489624,
47
+ "logps/chosen": -361.19525146484375,
48
+ "logps/rejected": -322.1033630371094,
49
+ "loss": 0.7204,
50
+ "rewards/accuracies": 0.5249999761581421,
51
+ "rewards/chosen": 0.05613602325320244,
52
+ "rewards/margins": 0.04490422457456589,
53
+ "rewards/rejected": 0.011231804266571999,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.06281078251766553,
58
+ "grad_norm": 34.76216235748818,
59
+ "learning_rate": 3.1249999999999997e-07,
60
+ "logits/chosen": -1.326378583908081,
61
+ "logits/rejected": -1.334900140762329,
62
+ "logps/chosen": -390.536376953125,
63
+ "logps/rejected": -312.28826904296875,
64
+ "loss": 0.7412,
65
+ "rewards/accuracies": 0.5062500238418579,
66
+ "rewards/chosen": -0.034933071583509445,
67
+ "rewards/margins": -0.05697251483798027,
68
+ "rewards/rejected": 0.022039445117115974,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.08374771002355404,
73
+ "grad_norm": 34.40909279988205,
74
+ "learning_rate": 4.1666666666666667e-07,
75
+ "logits/chosen": -1.3933440446853638,
76
+ "logits/rejected": -1.4404706954956055,
77
+ "logps/chosen": -329.0964050292969,
78
+ "logps/rejected": -291.3802490234375,
79
+ "loss": 0.725,
80
+ "rewards/accuracies": 0.4937500059604645,
81
+ "rewards/chosen": 0.02061540260910988,
82
+ "rewards/margins": 0.004305871669203043,
83
+ "rewards/rejected": 0.016309529542922974,
84
+ "step": 40
85
+ },
86
+ {
87
+ "epoch": 0.10468463752944256,
88
+ "grad_norm": 33.558772768765095,
89
+ "learning_rate": 5.208333333333334e-07,
90
+ "logits/chosen": -1.3897579908370972,
91
+ "logits/rejected": -1.4129403829574585,
92
+ "logps/chosen": -353.4942321777344,
93
+ "logps/rejected": -314.30755615234375,
94
+ "loss": 0.7195,
95
+ "rewards/accuracies": 0.512499988079071,
96
+ "rewards/chosen": 0.021332601085305214,
97
+ "rewards/margins": 0.062744140625,
98
+ "rewards/rejected": -0.04141153767704964,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.12562156503533106,
103
+ "grad_norm": 37.25091651222252,
104
+ "learning_rate": 6.249999999999999e-07,
105
+ "logits/chosen": -1.4710047245025635,
106
+ "logits/rejected": -1.4629329442977905,
107
+ "logps/chosen": -328.04437255859375,
108
+ "logps/rejected": -303.51922607421875,
109
+ "loss": 0.7164,
110
+ "rewards/accuracies": 0.518750011920929,
111
+ "rewards/chosen": 0.010426114313304424,
112
+ "rewards/margins": 0.02291293814778328,
113
+ "rewards/rejected": -0.01248682476580143,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.14655849254121958,
118
+ "grad_norm": 36.76489316948809,
119
+ "learning_rate": 7.291666666666666e-07,
120
+ "logits/chosen": -1.3989927768707275,
121
+ "logits/rejected": -1.3778417110443115,
122
+ "logps/chosen": -402.1053466796875,
123
+ "logps/rejected": -349.107421875,
124
+ "loss": 0.7271,
125
+ "rewards/accuracies": 0.543749988079071,
126
+ "rewards/chosen": 0.033832818269729614,
127
+ "rewards/margins": 0.09618775546550751,
128
+ "rewards/rejected": -0.062354933470487595,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 0.16749542004710807,
133
+ "grad_norm": 36.668613144227514,
134
+ "learning_rate": 8.333333333333333e-07,
135
+ "logits/chosen": -1.14809250831604,
136
+ "logits/rejected": -1.3004900217056274,
137
+ "logps/chosen": -393.3706970214844,
138
+ "logps/rejected": -302.55267333984375,
139
+ "loss": 0.7166,
140
+ "rewards/accuracies": 0.5625,
141
+ "rewards/chosen": 0.022442013025283813,
142
+ "rewards/margins": 0.0886504203081131,
143
+ "rewards/rejected": -0.06620840728282928,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 0.1884323475529966,
148
+ "grad_norm": 37.50656065265379,
149
+ "learning_rate": 9.374999999999999e-07,
150
+ "logits/chosen": -1.4117727279663086,
151
+ "logits/rejected": -1.4133552312850952,
152
+ "logps/chosen": -344.2930603027344,
153
+ "logps/rejected": -297.4874572753906,
154
+ "loss": 0.7401,
155
+ "rewards/accuracies": 0.4625000059604645,
156
+ "rewards/chosen": -0.002166506601497531,
157
+ "rewards/margins": -0.026871949434280396,
158
+ "rewards/rejected": 0.024705441668629646,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.2093692750588851,
163
+ "grad_norm": 40.39537923810557,
164
+ "learning_rate": 9.999463737538052e-07,
165
+ "logits/chosen": -1.4038206338882446,
166
+ "logits/rejected": -1.4591708183288574,
167
+ "logps/chosen": -382.0655822753906,
168
+ "logps/rejected": -325.4857482910156,
169
+ "loss": 0.7098,
170
+ "rewards/accuracies": 0.53125,
171
+ "rewards/chosen": 0.009398235008120537,
172
+ "rewards/margins": 0.012028699740767479,
173
+ "rewards/rejected": -0.002630466129630804,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.2093692750588851,
178
+ "eval_logits/chosen": -1.638710618019104,
179
+ "eval_logits/rejected": -1.6983067989349365,
180
+ "eval_logps/chosen": -375.59967041015625,
181
+ "eval_logps/rejected": -323.07269287109375,
182
+ "eval_loss": 0.7161566615104675,
183
+ "eval_rewards/accuracies": 0.5277777910232544,
184
+ "eval_rewards/chosen": -0.010860972106456757,
185
+ "eval_rewards/margins": 0.05658983066678047,
186
+ "eval_rewards/rejected": -0.06745080649852753,
187
+ "eval_runtime": 23.8466,
188
+ "eval_samples_per_second": 83.869,
189
+ "eval_steps_per_second": 2.642,
190
+ "step": 100
191
+ },
192
+ {
193
+ "epoch": 0.23030620256477363,
194
+ "grad_norm": 31.68914195854589,
195
+ "learning_rate": 9.993432105822034e-07,
196
+ "logits/chosen": -1.3994014263153076,
197
+ "logits/rejected": -1.5090693235397339,
198
+ "logps/chosen": -380.85809326171875,
199
+ "logps/rejected": -315.415771484375,
200
+ "loss": 0.7244,
201
+ "rewards/accuracies": 0.518750011920929,
202
+ "rewards/chosen": 0.0006190292770043015,
203
+ "rewards/margins": 0.017153877764940262,
204
+ "rewards/rejected": -0.016534846276044846,
205
+ "step": 110
206
+ },
207
+ {
208
+ "epoch": 0.2512431300706621,
209
+ "grad_norm": 32.5897566012542,
210
+ "learning_rate": 9.980706626858607e-07,
211
+ "logits/chosen": -1.3891736268997192,
212
+ "logits/rejected": -1.4548366069793701,
213
+ "logps/chosen": -330.4214782714844,
214
+ "logps/rejected": -290.9881286621094,
215
+ "loss": 0.7015,
216
+ "rewards/accuracies": 0.512499988079071,
217
+ "rewards/chosen": 0.002393510192632675,
218
+ "rewards/margins": 0.015673214569687843,
219
+ "rewards/rejected": -0.013279703445732594,
220
+ "step": 120
221
+ },
222
+ {
223
+ "epoch": 0.2721800575765506,
224
+ "grad_norm": 37.434859226068475,
225
+ "learning_rate": 9.961304359538434e-07,
226
+ "logits/chosen": -1.3691463470458984,
227
+ "logits/rejected": -1.4777592420578003,
228
+ "logps/chosen": -375.37957763671875,
229
+ "logps/rejected": -303.9295959472656,
230
+ "loss": 0.6959,
231
+ "rewards/accuracies": 0.518750011920929,
232
+ "rewards/chosen": 0.023477336391806602,
233
+ "rewards/margins": 0.07401027530431747,
234
+ "rewards/rejected": -0.05053293704986572,
235
+ "step": 130
236
+ },
237
+ {
238
+ "epoch": 0.29311698508243916,
239
+ "grad_norm": 32.710975845053845,
240
+ "learning_rate": 9.935251313189563e-07,
241
+ "logits/chosen": -1.2984201908111572,
242
+ "logits/rejected": -1.3567609786987305,
243
+ "logps/chosen": -403.44390869140625,
244
+ "logps/rejected": -315.7667541503906,
245
+ "loss": 0.694,
246
+ "rewards/accuracies": 0.5687500238418579,
247
+ "rewards/chosen": 0.1049453616142273,
248
+ "rewards/margins": 0.1717820167541504,
249
+ "rewards/rejected": -0.0668366402387619,
250
+ "step": 140
251
+ },
252
+ {
253
+ "epoch": 0.31405391258832765,
254
+ "grad_norm": 33.59099498042739,
255
+ "learning_rate": 9.902582412711118e-07,
256
+ "logits/chosen": -1.3101998567581177,
257
+ "logits/rejected": -1.4272937774658203,
258
+ "logps/chosen": -362.3497619628906,
259
+ "logps/rejected": -304.9016418457031,
260
+ "loss": 0.6869,
261
+ "rewards/accuracies": 0.5625,
262
+ "rewards/chosen": 0.030804574489593506,
263
+ "rewards/margins": 0.16174563765525818,
264
+ "rewards/rejected": -0.13094103336334229,
265
+ "step": 150
266
+ },
267
+ {
268
+ "epoch": 0.33499084009421615,
269
+ "grad_norm": 33.64536927205325,
270
+ "learning_rate": 9.86334145175542e-07,
271
+ "logits/chosen": -1.372686743736267,
272
+ "logits/rejected": -1.3871064186096191,
273
+ "logps/chosen": -333.40252685546875,
274
+ "logps/rejected": -309.45538330078125,
275
+ "loss": 0.698,
276
+ "rewards/accuracies": 0.550000011920929,
277
+ "rewards/chosen": 0.02940761111676693,
278
+ "rewards/margins": 0.14784011244773865,
279
+ "rewards/rejected": -0.11843249946832657,
280
+ "step": 160
281
+ },
282
+ {
283
+ "epoch": 0.3559277676001047,
284
+ "grad_norm": 32.519654688788634,
285
+ "learning_rate": 9.817581034021272e-07,
286
+ "logits/chosen": -1.469236135482788,
287
+ "logits/rejected": -1.5464330911636353,
288
+ "logps/chosen": -325.42401123046875,
289
+ "logps/rejected": -279.6885986328125,
290
+ "loss": 0.6943,
291
+ "rewards/accuracies": 0.543749988079071,
292
+ "rewards/chosen": 0.008727139793336391,
293
+ "rewards/margins": 0.06822283565998077,
294
+ "rewards/rejected": -0.05949569493532181,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 0.3768646951059932,
299
+ "grad_norm": 31.217980241389647,
300
+ "learning_rate": 9.765362502737097e-07,
301
+ "logits/chosen": -1.392663598060608,
302
+ "logits/rejected": -1.3267390727996826,
303
+ "logps/chosen": -328.3432312011719,
304
+ "logps/rejected": -328.40679931640625,
305
+ "loss": 0.6854,
306
+ "rewards/accuracies": 0.5625,
307
+ "rewards/chosen": -0.0019484326476231217,
308
+ "rewards/margins": 0.09815844148397446,
309
+ "rewards/rejected": -0.10010688006877899,
310
+ "step": 180
311
+ },
312
+ {
313
+ "epoch": 0.39780162261188173,
314
+ "grad_norm": 35.19062151474422,
315
+ "learning_rate": 9.706755858428485e-07,
316
+ "logits/chosen": -1.317312479019165,
317
+ "logits/rejected": -1.3148291110992432,
318
+ "logps/chosen": -339.4427490234375,
319
+ "logps/rejected": -339.53936767578125,
320
+ "loss": 0.6886,
321
+ "rewards/accuracies": 0.581250011920929,
322
+ "rewards/chosen": -0.017630616202950478,
323
+ "rewards/margins": 0.09675482660531998,
324
+ "rewards/rejected": -0.1143854409456253,
325
+ "step": 190
326
+ },
327
+ {
328
+ "epoch": 0.4187385501177702,
329
+ "grad_norm": 38.736703713611334,
330
+ "learning_rate": 9.641839665080363e-07,
331
+ "logits/chosen": -1.3673009872436523,
332
+ "logits/rejected": -1.3681590557098389,
333
+ "logps/chosen": -341.8690490722656,
334
+ "logps/rejected": -310.6811828613281,
335
+ "loss": 0.6825,
336
+ "rewards/accuracies": 0.5562499761581421,
337
+ "rewards/chosen": -0.02894829586148262,
338
+ "rewards/margins": 0.07689115405082703,
339
+ "rewards/rejected": -0.10583944618701935,
340
+ "step": 200
341
+ },
342
+ {
343
+ "epoch": 0.4187385501177702,
344
+ "eval_logits/chosen": -1.635803461074829,
345
+ "eval_logits/rejected": -1.69376540184021,
346
+ "eval_logps/chosen": -375.5799865722656,
347
+ "eval_logps/rejected": -323.3138732910156,
348
+ "eval_loss": 0.6841831207275391,
349
+ "eval_rewards/accuracies": 0.579365074634552,
350
+ "eval_rewards/chosen": -0.001013976288959384,
351
+ "eval_rewards/margins": 0.18699844181537628,
352
+ "eval_rewards/rejected": -0.18801240622997284,
353
+ "eval_runtime": 22.6938,
354
+ "eval_samples_per_second": 88.13,
355
+ "eval_steps_per_second": 2.776,
356
+ "step": 200
357
+ },
358
+ {
359
+ "epoch": 0.4396754776236587,
360
+ "grad_norm": 29.329351201852436,
361
+ "learning_rate": 9.570700944819582e-07,
362
+ "logits/chosen": -1.1939313411712646,
363
+ "logits/rejected": -1.3856732845306396,
364
+ "logps/chosen": -384.10784912109375,
365
+ "logps/rejected": -317.24755859375,
366
+ "loss": 0.684,
367
+ "rewards/accuracies": 0.612500011920929,
368
+ "rewards/chosen": 0.0037221908569335938,
369
+ "rewards/margins": 0.16761450469493866,
370
+ "rewards/rejected": -0.16389232873916626,
371
+ "step": 210
372
+ },
373
+ {
374
+ "epoch": 0.46061240512954726,
375
+ "grad_norm": 29.84515748563858,
376
+ "learning_rate": 9.493435061259129e-07,
377
+ "logits/chosen": -1.3803870677947998,
378
+ "logits/rejected": -1.3274301290512085,
379
+ "logps/chosen": -313.02777099609375,
380
+ "logps/rejected": -297.11004638671875,
381
+ "loss": 0.6846,
382
+ "rewards/accuracies": 0.5062500238418579,
383
+ "rewards/chosen": -0.023346154019236565,
384
+ "rewards/margins": 0.06815574318170547,
385
+ "rewards/rejected": -0.09150189161300659,
386
+ "step": 220
387
+ },
388
+ {
389
+ "epoch": 0.48154933263543576,
390
+ "grad_norm": 31.662572343473325,
391
+ "learning_rate": 9.4101455916603e-07,
392
+ "logits/chosen": -1.3570753335952759,
393
+ "logits/rejected": -1.3131479024887085,
394
+ "logps/chosen": -331.6525573730469,
395
+ "logps/rejected": -329.0929260253906,
396
+ "loss": 0.6834,
397
+ "rewards/accuracies": 0.574999988079071,
398
+ "rewards/chosen": -0.025835394859313965,
399
+ "rewards/margins": 0.16715176403522491,
400
+ "rewards/rejected": -0.1929871290922165,
401
+ "step": 230
402
+ },
403
+ {
404
+ "epoch": 0.5024862601413242,
405
+ "grad_norm": 34.26947764899909,
406
+ "learning_rate": 9.320944188084241e-07,
407
+ "logits/chosen": -1.3401970863342285,
408
+ "logits/rejected": -1.3203916549682617,
409
+ "logps/chosen": -341.75750732421875,
410
+ "logps/rejected": -315.9219970703125,
411
+ "loss": 0.6817,
412
+ "rewards/accuracies": 0.6312500238418579,
413
+ "rewards/chosen": 0.01570424810051918,
414
+ "rewards/margins": 0.13407179713249207,
415
+ "rewards/rejected": -0.11836756765842438,
416
+ "step": 240
417
+ },
418
+ {
419
+ "epoch": 0.5234231876472127,
420
+ "grad_norm": 32.570026830574825,
421
+ "learning_rate": 9.225950427718974e-07,
422
+ "logits/chosen": -1.3138176202774048,
423
+ "logits/rejected": -1.4214763641357422,
424
+ "logps/chosen": -388.63800048828125,
425
+ "logps/rejected": -327.308349609375,
426
+ "loss": 0.662,
427
+ "rewards/accuracies": 0.6312500238418579,
428
+ "rewards/chosen": 0.034160248935222626,
429
+ "rewards/margins": 0.23999419808387756,
430
+ "rewards/rejected": -0.20583395659923553,
431
+ "step": 250
432
+ },
433
+ {
434
+ "epoch": 0.5443601151531012,
435
+ "grad_norm": 31.875700958447943,
436
+ "learning_rate": 9.125291652582547e-07,
437
+ "logits/chosen": -1.171736717224121,
438
+ "logits/rejected": -1.2927379608154297,
439
+ "logps/chosen": -367.10638427734375,
440
+ "logps/rejected": -301.8586730957031,
441
+ "loss": 0.6732,
442
+ "rewards/accuracies": 0.59375,
443
+ "rewards/chosen": -0.03132578730583191,
444
+ "rewards/margins": 0.2260134220123291,
445
+ "rewards/rejected": -0.2573392391204834,
446
+ "step": 260
447
+ },
448
+ {
449
+ "epoch": 0.5652970426589898,
450
+ "grad_norm": 29.95901746509434,
451
+ "learning_rate": 9.019102798817195e-07,
452
+ "logits/chosen": -1.4621995687484741,
453
+ "logits/rejected": -1.563369631767273,
454
+ "logps/chosen": -379.08746337890625,
455
+ "logps/rejected": -326.1041259765625,
456
+ "loss": 0.6799,
457
+ "rewards/accuracies": 0.574999988079071,
458
+ "rewards/chosen": 0.010546261444687843,
459
+ "rewards/margins": 0.17825904488563538,
460
+ "rewards/rejected": -0.16771277785301208,
461
+ "step": 270
462
+ },
463
+ {
464
+ "epoch": 0.5862339701648783,
465
+ "grad_norm": 35.30660280902673,
466
+ "learning_rate": 8.90752621580335e-07,
467
+ "logits/chosen": -1.4563312530517578,
468
+ "logits/rejected": -1.5124019384384155,
469
+ "logps/chosen": -360.3949279785156,
470
+ "logps/rejected": -296.45318603515625,
471
+ "loss": 0.6904,
472
+ "rewards/accuracies": 0.53125,
473
+ "rewards/chosen": -0.05117585510015488,
474
+ "rewards/margins": 0.1499311625957489,
475
+ "rewards/rejected": -0.20110702514648438,
476
+ "step": 280
477
+ },
478
+ {
479
+ "epoch": 0.6071708976707668,
480
+ "grad_norm": 30.02627471974457,
481
+ "learning_rate": 8.79071147533597e-07,
482
+ "logits/chosen": -1.3743504285812378,
483
+ "logits/rejected": -1.3512884378433228,
484
+ "logps/chosen": -341.1376647949219,
485
+ "logps/rejected": -331.7826843261719,
486
+ "loss": 0.6581,
487
+ "rewards/accuracies": 0.643750011920929,
488
+ "rewards/chosen": -0.004191230051219463,
489
+ "rewards/margins": 0.26077696681022644,
490
+ "rewards/rejected": -0.26496821641921997,
491
+ "step": 290
492
+ },
493
+ {
494
+ "epoch": 0.6281078251766553,
495
+ "grad_norm": 32.675301753581984,
496
+ "learning_rate": 8.668815171119019e-07,
497
+ "logits/chosen": -1.2676564455032349,
498
+ "logits/rejected": -1.319390058517456,
499
+ "logps/chosen": -378.872314453125,
500
+ "logps/rejected": -322.4703369140625,
501
+ "loss": 0.663,
502
+ "rewards/accuracies": 0.6499999761581421,
503
+ "rewards/chosen": -0.022002484649419785,
504
+ "rewards/margins": 0.1741904467344284,
505
+ "rewards/rejected": -0.1961929351091385,
506
+ "step": 300
507
+ },
508
+ {
509
+ "epoch": 0.6281078251766553,
510
+ "eval_logits/chosen": -1.6317138671875,
511
+ "eval_logits/rejected": -1.6892949342727661,
512
+ "eval_logps/chosen": -375.532958984375,
513
+ "eval_logps/rejected": -323.4156494140625,
514
+ "eval_loss": 0.6616633534431458,
515
+ "eval_rewards/accuracies": 0.60317462682724,
516
+ "eval_rewards/chosen": 0.022496730089187622,
517
+ "eval_rewards/margins": 0.2614184617996216,
518
+ "eval_rewards/rejected": -0.23892174661159515,
519
+ "eval_runtime": 22.3915,
520
+ "eval_samples_per_second": 89.32,
521
+ "eval_steps_per_second": 2.814,
522
+ "step": 300
523
+ },
524
+ {
525
+ "epoch": 0.6490447526825438,
526
+ "grad_norm": 32.37940998012059,
527
+ "learning_rate": 8.54200070884685e-07,
528
+ "logits/chosen": -1.3149330615997314,
529
+ "logits/rejected": -1.4230272769927979,
530
+ "logps/chosen": -389.00384521484375,
531
+ "logps/rejected": -293.93487548828125,
532
+ "loss": 0.6471,
533
+ "rewards/accuracies": 0.606249988079071,
534
+ "rewards/chosen": -0.028895879164338112,
535
+ "rewards/margins": 0.29164841771125793,
536
+ "rewards/rejected": -0.3205442726612091,
537
+ "step": 310
538
+ },
539
+ {
540
+ "epoch": 0.6699816801884323,
541
+ "grad_norm": 34.74994897373754,
542
+ "learning_rate": 8.410438087153911e-07,
543
+ "logits/chosen": -1.2203506231307983,
544
+ "logits/rejected": -1.28737211227417,
545
+ "logps/chosen": -353.5164489746094,
546
+ "logps/rejected": -297.2350769042969,
547
+ "loss": 0.6689,
548
+ "rewards/accuracies": 0.5687500238418579,
549
+ "rewards/chosen": -0.03598624840378761,
550
+ "rewards/margins": 0.1709606647491455,
551
+ "rewards/rejected": -0.20694692432880402,
552
+ "step": 320
553
+ },
554
+ {
555
+ "epoch": 0.6909186076943209,
556
+ "grad_norm": 27.89021442930517,
557
+ "learning_rate": 8.274303669726426e-07,
558
+ "logits/chosen": -1.2289667129516602,
559
+ "logits/rejected": -1.3027209043502808,
560
+ "logps/chosen": -351.02435302734375,
561
+ "logps/rejected": -311.8251953125,
562
+ "loss": 0.6675,
563
+ "rewards/accuracies": 0.668749988079071,
564
+ "rewards/chosen": -0.005795622244477272,
565
+ "rewards/margins": 0.3018319308757782,
566
+ "rewards/rejected": -0.30762752890586853,
567
+ "step": 330
568
+ },
569
+ {
570
+ "epoch": 0.7118555352002094,
571
+ "grad_norm": 32.72596936653152,
572
+ "learning_rate": 8.133779948881513e-07,
573
+ "logits/chosen": -1.3401914834976196,
574
+ "logits/rejected": -1.387083888053894,
575
+ "logps/chosen": -355.665771484375,
576
+ "logps/rejected": -318.4053955078125,
577
+ "loss": 0.6557,
578
+ "rewards/accuracies": 0.65625,
579
+ "rewards/chosen": -0.04015441983938217,
580
+ "rewards/margins": 0.22958505153656006,
581
+ "rewards/rejected": -0.2697394788265228,
582
+ "step": 340
583
+ },
584
+ {
585
+ "epoch": 0.7327924627060979,
586
+ "grad_norm": 35.42610267799739,
587
+ "learning_rate": 7.989055300930704e-07,
588
+ "logits/chosen": -1.3034248352050781,
589
+ "logits/rejected": -1.3343088626861572,
590
+ "logps/chosen": -340.1492614746094,
591
+ "logps/rejected": -289.6612854003906,
592
+ "loss": 0.6554,
593
+ "rewards/accuracies": 0.581250011920929,
594
+ "rewards/chosen": -0.1225026398897171,
595
+ "rewards/margins": 0.12164878845214844,
596
+ "rewards/rejected": -0.24415142834186554,
597
+ "step": 350
598
+ },
599
+ {
600
+ "epoch": 0.7537293902119864,
601
+ "grad_norm": 29.02918447808193,
602
+ "learning_rate": 7.840323733655778e-07,
603
+ "logits/chosen": -1.2503941059112549,
604
+ "logits/rejected": -1.4153048992156982,
605
+ "logps/chosen": -406.9992980957031,
606
+ "logps/rejected": -322.87994384765625,
607
+ "loss": 0.6565,
608
+ "rewards/accuracies": 0.675000011920929,
609
+ "rewards/chosen": 0.024089232087135315,
610
+ "rewards/margins": 0.31749090552330017,
611
+ "rewards/rejected": -0.29340168833732605,
612
+ "step": 360
613
+ },
614
+ {
615
+ "epoch": 0.7746663177178749,
616
+ "grad_norm": 33.710516075595095,
617
+ "learning_rate": 7.687784626235447e-07,
618
+ "logits/chosen": -1.312897801399231,
619
+ "logits/rejected": -1.3840960264205933,
620
+ "logps/chosen": -369.8164978027344,
621
+ "logps/rejected": -291.6910705566406,
622
+ "loss": 0.643,
623
+ "rewards/accuracies": 0.643750011920929,
624
+ "rewards/chosen": 0.02891460619866848,
625
+ "rewards/margins": 0.3528975546360016,
626
+ "rewards/rejected": -0.3239828944206238,
627
+ "step": 370
628
+ },
629
+ {
630
+ "epoch": 0.7956032452237635,
631
+ "grad_norm": 30.30908317538956,
632
+ "learning_rate": 7.531642461971514e-07,
633
+ "logits/chosen": -1.4397472143173218,
634
+ "logits/rejected": -1.5266616344451904,
635
+ "logps/chosen": -368.24224853515625,
636
+ "logps/rejected": -316.00567626953125,
637
+ "loss": 0.6553,
638
+ "rewards/accuracies": 0.7124999761581421,
639
+ "rewards/chosen": -0.07685438543558121,
640
+ "rewards/margins": 0.29695773124694824,
641
+ "rewards/rejected": -0.37381213903427124,
642
+ "step": 380
643
+ },
644
+ {
645
+ "epoch": 0.816540172729652,
646
+ "grad_norm": 34.60440393056815,
647
+ "learning_rate": 7.372106554172801e-07,
648
+ "logits/chosen": -1.3900893926620483,
649
+ "logits/rejected": -1.348008394241333,
650
+ "logps/chosen": -369.2901306152344,
651
+ "logps/rejected": -338.5638427734375,
652
+ "loss": 0.661,
653
+ "rewards/accuracies": 0.6187499761581421,
654
+ "rewards/chosen": -0.0064844414591789246,
655
+ "rewards/margins": 0.22314131259918213,
656
+ "rewards/rejected": -0.22962574660778046,
657
+ "step": 390
658
+ },
659
+ {
660
+ "epoch": 0.8374771002355405,
661
+ "grad_norm": 31.77115164859215,
662
+ "learning_rate": 7.209390765564318e-07,
663
+ "logits/chosen": -1.4996167421340942,
664
+ "logits/rejected": -1.4596645832061768,
665
+ "logps/chosen": -308.35113525390625,
666
+ "logps/rejected": -281.60693359375,
667
+ "loss": 0.6547,
668
+ "rewards/accuracies": 0.65625,
669
+ "rewards/chosen": -0.04348212480545044,
670
+ "rewards/margins": 0.27009111642837524,
671
+ "rewards/rejected": -0.3135732412338257,
672
+ "step": 400
673
+ },
674
+ {
675
+ "epoch": 0.8374771002355405,
676
+ "eval_logits/chosen": -1.6414421796798706,
677
+ "eval_logits/rejected": -1.6979553699493408,
678
+ "eval_logps/chosen": -375.57781982421875,
679
+ "eval_logps/rejected": -323.6410217285156,
680
+ "eval_loss": 0.6590762734413147,
681
+ "eval_rewards/accuracies": 0.6388888955116272,
682
+ "eval_rewards/chosen": 7.240355625981465e-05,
683
+ "eval_rewards/margins": 0.3516833484172821,
684
+ "eval_rewards/rejected": -0.3516109585762024,
685
+ "eval_runtime": 22.0937,
686
+ "eval_samples_per_second": 90.524,
687
+ "eval_steps_per_second": 2.851,
688
+ "step": 400
689
+ },
690
+ {
691
+ "epoch": 0.8584140277414289,
692
+ "grad_norm": 31.789442088224007,
693
+ "learning_rate": 7.043713221597773e-07,
694
+ "logits/chosen": -1.2454522848129272,
695
+ "logits/rejected": -1.4237592220306396,
696
+ "logps/chosen": -399.0449523925781,
697
+ "logps/rejected": -326.0782775878906,
698
+ "loss": 0.6462,
699
+ "rewards/accuracies": 0.612500011920929,
700
+ "rewards/chosen": 0.014972883276641369,
701
+ "rewards/margins": 0.2356092631816864,
702
+ "rewards/rejected": -0.22063639760017395,
703
+ "step": 410
704
+ },
705
+ {
706
+ "epoch": 0.8793509552473174,
707
+ "grad_norm": 32.68488619173213,
708
+ "learning_rate": 6.875296018047809e-07,
709
+ "logits/chosen": -1.5021634101867676,
710
+ "logits/rejected": -1.4257535934448242,
711
+ "logps/chosen": -349.61553955078125,
712
+ "logps/rejected": -340.30975341796875,
713
+ "loss": 0.6371,
714
+ "rewards/accuracies": 0.6312500238418579,
715
+ "rewards/chosen": -0.10846195369958878,
716
+ "rewards/margins": 0.26752999424934387,
717
+ "rewards/rejected": -0.37599191069602966,
718
+ "step": 420
719
+ },
720
+ {
721
+ "epoch": 0.9002878827532059,
722
+ "grad_norm": 29.283057753738706,
723
+ "learning_rate": 6.704364923285857e-07,
724
+ "logits/chosen": -1.3049055337905884,
725
+ "logits/rejected": -1.4615538120269775,
726
+ "logps/chosen": -387.3922119140625,
727
+ "logps/rejected": -299.621337890625,
728
+ "loss": 0.6539,
729
+ "rewards/accuracies": 0.5687500238418579,
730
+ "rewards/chosen": -0.06501199305057526,
731
+ "rewards/margins": 0.30248281359672546,
732
+ "rewards/rejected": -0.3674947917461395,
733
+ "step": 430
734
+ },
735
+ {
736
+ "epoch": 0.9212248102590945,
737
+ "grad_norm": 30.29029590057011,
738
+ "learning_rate": 6.531149075630796e-07,
739
+ "logits/chosen": -1.2950228452682495,
740
+ "logits/rejected": -1.3765883445739746,
741
+ "logps/chosen": -359.27691650390625,
742
+ "logps/rejected": -293.13104248046875,
743
+ "loss": 0.6508,
744
+ "rewards/accuracies": 0.5874999761581421,
745
+ "rewards/chosen": -0.05934730917215347,
746
+ "rewards/margins": 0.2306128442287445,
747
+ "rewards/rejected": -0.28996017575263977,
748
+ "step": 440
749
+ },
750
+ {
751
+ "epoch": 0.942161737764983,
752
+ "grad_norm": 31.040736224709473,
753
+ "learning_rate": 6.355880676182085e-07,
754
+ "logits/chosen": -1.4344260692596436,
755
+ "logits/rejected": -1.3737527132034302,
756
+ "logps/chosen": -357.5885314941406,
757
+ "logps/rejected": -332.05499267578125,
758
+ "loss": 0.6653,
759
+ "rewards/accuracies": 0.675000011920929,
760
+ "rewards/chosen": -0.04119770601391792,
761
+ "rewards/margins": 0.2782004475593567,
762
+ "rewards/rejected": -0.31939810514450073,
763
+ "step": 450
764
+ },
765
+ {
766
+ "epoch": 0.9630986652708715,
767
+ "grad_norm": 31.5228211898507,
768
+ "learning_rate": 6.178794677547137e-07,
769
+ "logits/chosen": -1.2749378681182861,
770
+ "logits/rejected": -1.4267973899841309,
771
+ "logps/chosen": -346.6052551269531,
772
+ "logps/rejected": -290.1454162597656,
773
+ "loss": 0.6415,
774
+ "rewards/accuracies": 0.6625000238418579,
775
+ "rewards/chosen": 0.026611875742673874,
776
+ "rewards/margins": 0.3765586316585541,
777
+ "rewards/rejected": -0.3499467074871063,
778
+ "step": 460
779
+ },
780
+ {
781
+ "epoch": 0.98403559277676,
782
+ "grad_norm": 31.660312290338712,
783
+ "learning_rate": 6.000128468880222e-07,
784
+ "logits/chosen": -1.3701679706573486,
785
+ "logits/rejected": -1.390453815460205,
786
+ "logps/chosen": -369.3509826660156,
787
+ "logps/rejected": -323.2626037597656,
788
+ "loss": 0.6358,
789
+ "rewards/accuracies": 0.6000000238418579,
790
+ "rewards/chosen": -0.04081202298402786,
791
+ "rewards/margins": 0.33853498101234436,
792
+ "rewards/rejected": -0.37934696674346924,
793
+ "step": 470
794
+ },
795
+ {
796
+ "epoch": 1.0049725202826485,
797
+ "grad_norm": 27.48266116042723,
798
+ "learning_rate": 5.820121557655108e-07,
799
+ "logits/chosen": -1.2848116159439087,
800
+ "logits/rejected": -1.3119057416915894,
801
+ "logps/chosen": -359.86102294921875,
802
+ "logps/rejected": -313.0788269042969,
803
+ "loss": 0.6459,
804
+ "rewards/accuracies": 0.625,
805
+ "rewards/chosen": -0.048723649233579636,
806
+ "rewards/margins": 0.27731940150260925,
807
+ "rewards/rejected": -0.3260430693626404,
808
+ "step": 480
809
+ },
810
+ {
811
+ "epoch": 1.025909447788537,
812
+ "grad_norm": 31.66810532106133,
813
+ "learning_rate": 5.639015248598023e-07,
814
+ "logits/chosen": -1.34721839427948,
815
+ "logits/rejected": -1.449410080909729,
816
+ "logps/chosen": -361.23046875,
817
+ "logps/rejected": -296.3492126464844,
818
+ "loss": 0.6519,
819
+ "rewards/accuracies": 0.625,
820
+ "rewards/chosen": -0.09297003597021103,
821
+ "rewards/margins": 0.209228515625,
822
+ "rewards/rejected": -0.302198588848114,
823
+ "step": 490
824
+ },
825
+ {
826
+ "epoch": 1.0468463752944255,
827
+ "grad_norm": 31.533076924353868,
828
+ "learning_rate": 5.457052320211339e-07,
829
+ "logits/chosen": -1.3988004922866821,
830
+ "logits/rejected": -1.4485900402069092,
831
+ "logps/chosen": -370.951904296875,
832
+ "logps/rejected": -314.5948181152344,
833
+ "loss": 0.6456,
834
+ "rewards/accuracies": 0.550000011920929,
835
+ "rewards/chosen": -0.10979924350976944,
836
+ "rewards/margins": 0.23507650196552277,
837
+ "rewards/rejected": -0.3448757231235504,
838
+ "step": 500
839
+ },
840
+ {
841
+ "epoch": 1.0468463752944255,
842
+ "eval_logits/chosen": -1.636523962020874,
843
+ "eval_logits/rejected": -1.6931382417678833,
844
+ "eval_logps/chosen": -375.5513916015625,
845
+ "eval_logps/rejected": -323.6509704589844,
846
+ "eval_loss": 0.6429626941680908,
847
+ "eval_rewards/accuracies": 0.6666666865348816,
848
+ "eval_rewards/chosen": 0.01327058020979166,
849
+ "eval_rewards/margins": 0.36985495686531067,
850
+ "eval_rewards/rejected": -0.356584370136261,
851
+ "eval_runtime": 22.4458,
852
+ "eval_samples_per_second": 89.104,
853
+ "eval_steps_per_second": 2.807,
854
+ "step": 500
855
+ },
856
+ {
857
+ "epoch": 1.067783302800314,
858
+ "grad_norm": 30.307455199305462,
859
+ "learning_rate": 5.274476699321637e-07,
860
+ "logits/chosen": -1.4101717472076416,
861
+ "logits/rejected": -1.420190453529358,
862
+ "logps/chosen": -327.46893310546875,
863
+ "logps/rejected": -302.391357421875,
864
+ "loss": 0.6345,
865
+ "rewards/accuracies": 0.6312500238418579,
866
+ "rewards/chosen": -0.04776526987552643,
867
+ "rewards/margins": 0.2989855408668518,
868
+ "rewards/rejected": -0.3467508554458618,
869
+ "step": 510
870
+ },
871
+ {
872
+ "epoch": 1.0887202303062025,
873
+ "grad_norm": 29.722389429702496,
874
+ "learning_rate": 5.091533134088387e-07,
875
+ "logits/chosen": -1.4273402690887451,
876
+ "logits/rejected": -1.4231868982315063,
877
+ "logps/chosen": -323.42376708984375,
878
+ "logps/rejected": -304.4673767089844,
879
+ "loss": 0.6284,
880
+ "rewards/accuracies": 0.6499999761581421,
881
+ "rewards/chosen": -0.0829726904630661,
882
+ "rewards/margins": 0.24710920453071594,
883
+ "rewards/rejected": -0.33008190989494324,
884
+ "step": 520
885
+ },
886
+ {
887
+ "epoch": 1.109657157812091,
888
+ "grad_norm": 28.885339598508185,
889
+ "learning_rate": 4.908466865911614e-07,
890
+ "logits/chosen": -1.4464526176452637,
891
+ "logits/rejected": -1.5408886671066284,
892
+ "logps/chosen": -344.06689453125,
893
+ "logps/rejected": -292.1816101074219,
894
+ "loss": 0.6328,
895
+ "rewards/accuracies": 0.643750011920929,
896
+ "rewards/chosen": -0.03225661441683769,
897
+ "rewards/margins": 0.38520506024360657,
898
+ "rewards/rejected": -0.41746169328689575,
899
+ "step": 530
900
+ },
901
+ {
902
+ "epoch": 1.1305940853179797,
903
+ "grad_norm": 29.441675570164428,
904
+ "learning_rate": 4.7255233006783624e-07,
905
+ "logits/chosen": -1.3947404623031616,
906
+ "logits/rejected": -1.328045129776001,
907
+ "logps/chosen": -318.3937072753906,
908
+ "logps/rejected": -280.65435791015625,
909
+ "loss": 0.6372,
910
+ "rewards/accuracies": 0.675000011920929,
911
+ "rewards/chosen": -0.03985299542546272,
912
+ "rewards/margins": 0.33030450344085693,
913
+ "rewards/rejected": -0.37015751004219055,
914
+ "step": 540
915
+ },
916
+ {
917
+ "epoch": 1.151531012823868,
918
+ "grad_norm": 29.84505193266847,
919
+ "learning_rate": 4.5429476797886617e-07,
920
+ "logits/chosen": -1.3157615661621094,
921
+ "logits/rejected": -1.463357925415039,
922
+ "logps/chosen": -367.59564208984375,
923
+ "logps/rejected": -284.3589172363281,
924
+ "loss": 0.6256,
925
+ "rewards/accuracies": 0.675000011920929,
926
+ "rewards/chosen": -0.04548700898885727,
927
+ "rewards/margins": 0.312529981136322,
928
+ "rewards/rejected": -0.3580169677734375,
929
+ "step": 550
930
+ },
931
+ {
932
+ "epoch": 1.1724679403297567,
933
+ "grad_norm": 33.58402259379722,
934
+ "learning_rate": 4.3609847514019763e-07,
935
+ "logits/chosen": -1.3491549491882324,
936
+ "logits/rejected": -1.3917362689971924,
937
+ "logps/chosen": -355.3868408203125,
938
+ "logps/rejected": -311.2879943847656,
939
+ "loss": 0.6294,
940
+ "rewards/accuracies": 0.643750011920929,
941
+ "rewards/chosen": 0.005126100964844227,
942
+ "rewards/margins": 0.33152928948402405,
943
+ "rewards/rejected": -0.3264032006263733,
944
+ "step": 560
945
+ },
946
+ {
947
+ "epoch": 1.193404867835645,
948
+ "grad_norm": 29.80795991678476,
949
+ "learning_rate": 4.179878442344892e-07,
950
+ "logits/chosen": -1.4194284677505493,
951
+ "logits/rejected": -1.371366262435913,
952
+ "logps/chosen": -326.87066650390625,
953
+ "logps/rejected": -321.23785400390625,
954
+ "loss": 0.6441,
955
+ "rewards/accuracies": 0.6812499761581421,
956
+ "rewards/chosen": 0.0237637497484684,
957
+ "rewards/margins": 0.33783239126205444,
958
+ "rewards/rejected": -0.31406864523887634,
959
+ "step": 570
960
+ },
961
+ {
962
+ "epoch": 1.2143417953415336,
963
+ "grad_norm": 30.101844117489865,
964
+ "learning_rate": 3.9998715311197783e-07,
965
+ "logits/chosen": -1.452893853187561,
966
+ "logits/rejected": -1.460272192955017,
967
+ "logps/chosen": -346.24932861328125,
968
+ "logps/rejected": -292.27667236328125,
969
+ "loss": 0.6258,
970
+ "rewards/accuracies": 0.6937500238418579,
971
+ "rewards/chosen": -0.029639745131134987,
972
+ "rewards/margins": 0.4036487638950348,
973
+ "rewards/rejected": -0.43328848481178284,
974
+ "step": 580
975
+ },
976
+ {
977
+ "epoch": 1.235278722847422,
978
+ "grad_norm": 28.648910640416673,
979
+ "learning_rate": 3.821205322452863e-07,
980
+ "logits/chosen": -1.2438939809799194,
981
+ "logits/rejected": -1.3377447128295898,
982
+ "logps/chosen": -380.3300476074219,
983
+ "logps/rejected": -317.4806823730469,
984
+ "loss": 0.6249,
985
+ "rewards/accuracies": 0.643750011920929,
986
+ "rewards/chosen": -0.0937061458826065,
987
+ "rewards/margins": 0.34427446126937866,
988
+ "rewards/rejected": -0.43798065185546875,
989
+ "step": 590
990
+ },
991
+ {
992
+ "epoch": 1.2562156503533106,
993
+ "grad_norm": 34.234491670474995,
994
+ "learning_rate": 3.6441193238179146e-07,
995
+ "logits/chosen": -1.4130090475082397,
996
+ "logits/rejected": -1.340057611465454,
997
+ "logps/chosen": -377.28228759765625,
998
+ "logps/rejected": -362.57879638671875,
999
+ "loss": 0.6054,
1000
+ "rewards/accuracies": 0.59375,
1001
+ "rewards/chosen": -0.08141253888607025,
1002
+ "rewards/margins": 0.26778295636177063,
1003
+ "rewards/rejected": -0.3491954803466797,
1004
+ "step": 600
1005
+ },
1006
+ {
1007
+ "epoch": 1.2562156503533106,
1008
+ "eval_logits/chosen": -1.6430615186691284,
1009
+ "eval_logits/rejected": -1.6991180181503296,
1010
+ "eval_logps/chosen": -375.643798828125,
1011
+ "eval_logps/rejected": -323.71673583984375,
1012
+ "eval_loss": 0.6423465609550476,
1013
+ "eval_rewards/accuracies": 0.6349206566810608,
1014
+ "eval_rewards/chosen": -0.032912444323301315,
1015
+ "eval_rewards/margins": 0.3565501272678375,
1016
+ "eval_rewards/rejected": -0.38946259021759033,
1017
+ "eval_runtime": 22.0916,
1018
+ "eval_samples_per_second": 90.532,
1019
+ "eval_steps_per_second": 2.852,
1020
+ "step": 600
1021
+ },
1022
+ {
1023
+ "epoch": 1.2771525778591992,
1024
+ "grad_norm": 31.96916362076228,
1025
+ "learning_rate": 3.4688509243692034e-07,
1026
+ "logits/chosen": -1.3404086828231812,
1027
+ "logits/rejected": -1.401881217956543,
1028
+ "logps/chosen": -348.0643005371094,
1029
+ "logps/rejected": -274.1236267089844,
1030
+ "loss": 0.6089,
1031
+ "rewards/accuracies": 0.7437499761581421,
1032
+ "rewards/chosen": 0.05336350202560425,
1033
+ "rewards/margins": 0.5003852248191833,
1034
+ "rewards/rejected": -0.4470217823982239,
1035
+ "step": 610
1036
+ },
1037
+ {
1038
+ "epoch": 1.2980895053650876,
1039
+ "grad_norm": 27.687942045456108,
1040
+ "learning_rate": 3.295635076714144e-07,
1041
+ "logits/chosen": -1.1929457187652588,
1042
+ "logits/rejected": -1.3312275409698486,
1043
+ "logps/chosen": -339.0008544921875,
1044
+ "logps/rejected": -282.86273193359375,
1045
+ "loss": 0.6173,
1046
+ "rewards/accuracies": 0.65625,
1047
+ "rewards/chosen": -0.04629793018102646,
1048
+ "rewards/margins": 0.3611483573913574,
1049
+ "rewards/rejected": -0.4074462950229645,
1050
+ "step": 620
1051
+ },
1052
+ {
1053
+ "epoch": 1.3190264328709762,
1054
+ "grad_norm": 30.364814060074504,
1055
+ "learning_rate": 3.12470398195219e-07,
1056
+ "logits/chosen": -1.4581263065338135,
1057
+ "logits/rejected": -1.4359983205795288,
1058
+ "logps/chosen": -352.0130310058594,
1059
+ "logps/rejected": -319.5924987792969,
1060
+ "loss": 0.6124,
1061
+ "rewards/accuracies": 0.706250011920929,
1062
+ "rewards/chosen": 0.0860348716378212,
1063
+ "rewards/margins": 0.4933602213859558,
1064
+ "rewards/rejected": -0.4073253571987152,
1065
+ "step": 630
1066
+ },
1067
+ {
1068
+ "epoch": 1.3399633603768648,
1069
+ "grad_norm": 28.207562684445328,
1070
+ "learning_rate": 2.956286778402226e-07,
1071
+ "logits/chosen": -1.3643245697021484,
1072
+ "logits/rejected": -1.3367928266525269,
1073
+ "logps/chosen": -329.8095397949219,
1074
+ "logps/rejected": -325.6297912597656,
1075
+ "loss": 0.6194,
1076
+ "rewards/accuracies": 0.668749988079071,
1077
+ "rewards/chosen": -0.03843813017010689,
1078
+ "rewards/margins": 0.4010621905326843,
1079
+ "rewards/rejected": -0.4395003318786621,
1080
+ "step": 640
1081
+ },
1082
+ {
1083
+ "epoch": 1.3609002878827532,
1084
+ "grad_norm": 31.931835373695627,
1085
+ "learning_rate": 2.7906092344356826e-07,
1086
+ "logits/chosen": -1.4707378149032593,
1087
+ "logits/rejected": -1.4725669622421265,
1088
+ "logps/chosen": -321.7577819824219,
1089
+ "logps/rejected": -296.30072021484375,
1090
+ "loss": 0.6295,
1091
+ "rewards/accuracies": 0.5874999761581421,
1092
+ "rewards/chosen": -0.20691867172718048,
1093
+ "rewards/margins": 0.24088874459266663,
1094
+ "rewards/rejected": -0.4478074014186859,
1095
+ "step": 650
1096
+ },
1097
+ {
1098
+ "epoch": 1.3818372153886418,
1099
+ "grad_norm": 33.26417313890622,
1100
+ "learning_rate": 2.6278934458271996e-07,
1101
+ "logits/chosen": -1.280631422996521,
1102
+ "logits/rejected": -1.3918514251708984,
1103
+ "logps/chosen": -319.20391845703125,
1104
+ "logps/rejected": -294.9554138183594,
1105
+ "loss": 0.6281,
1106
+ "rewards/accuracies": 0.59375,
1107
+ "rewards/chosen": -0.15276700258255005,
1108
+ "rewards/margins": 0.14049580693244934,
1109
+ "rewards/rejected": -0.293262779712677,
1110
+ "step": 660
1111
+ },
1112
+ {
1113
+ "epoch": 1.4027741428945302,
1114
+ "grad_norm": 27.692247842703622,
1115
+ "learning_rate": 2.468357538028487e-07,
1116
+ "logits/chosen": -1.3904974460601807,
1117
+ "logits/rejected": -1.4726046323776245,
1118
+ "logps/chosen": -351.35614013671875,
1119
+ "logps/rejected": -299.1191101074219,
1120
+ "loss": 0.6141,
1121
+ "rewards/accuracies": 0.6812499761581421,
1122
+ "rewards/chosen": 0.012430467642843723,
1123
+ "rewards/margins": 0.4077607989311218,
1124
+ "rewards/rejected": -0.39533036947250366,
1125
+ "step": 670
1126
+ },
1127
+ {
1128
+ "epoch": 1.4237110704004188,
1129
+ "grad_norm": 32.32896888329579,
1130
+ "learning_rate": 2.312215373764551e-07,
1131
+ "logits/chosen": -1.3210734128952026,
1132
+ "logits/rejected": -1.2972867488861084,
1133
+ "logps/chosen": -357.87725830078125,
1134
+ "logps/rejected": -346.83172607421875,
1135
+ "loss": 0.612,
1136
+ "rewards/accuracies": 0.6312500238418579,
1137
+ "rewards/chosen": -0.030674666166305542,
1138
+ "rewards/margins": 0.2985823154449463,
1139
+ "rewards/rejected": -0.32925695180892944,
1140
+ "step": 680
1141
+ },
1142
+ {
1143
+ "epoch": 1.4446479979063072,
1144
+ "grad_norm": 26.559915261204598,
1145
+ "learning_rate": 2.1596762663442213e-07,
1146
+ "logits/chosen": -1.353945016860962,
1147
+ "logits/rejected": -1.331405758857727,
1148
+ "logps/chosen": -359.0257263183594,
1149
+ "logps/rejected": -302.020751953125,
1150
+ "loss": 0.6143,
1151
+ "rewards/accuracies": 0.637499988079071,
1152
+ "rewards/chosen": -0.09165488183498383,
1153
+ "rewards/margins": 0.32221120595932007,
1154
+ "rewards/rejected": -0.4138661324977875,
1155
+ "step": 690
1156
+ },
1157
+ {
1158
+ "epoch": 1.4655849254121958,
1159
+ "grad_norm": 28.923951135790823,
1160
+ "learning_rate": 2.0109446990692963e-07,
1161
+ "logits/chosen": -1.3177390098571777,
1162
+ "logits/rejected": -1.2676212787628174,
1163
+ "logps/chosen": -382.9945373535156,
1164
+ "logps/rejected": -382.7451171875,
1165
+ "loss": 0.6129,
1166
+ "rewards/accuracies": 0.675000011920929,
1167
+ "rewards/chosen": 0.00983230210840702,
1168
+ "rewards/margins": 0.44890522956848145,
1169
+ "rewards/rejected": -0.43907293677330017,
1170
+ "step": 700
1171
+ },
1172
+ {
1173
+ "epoch": 1.4655849254121958,
1174
+ "eval_logits/chosen": -1.6413902044296265,
1175
+ "eval_logits/rejected": -1.6979249715805054,
1176
+ "eval_logps/chosen": -375.667724609375,
1177
+ "eval_logps/rejected": -323.7745361328125,
1178
+ "eval_loss": 0.6431064009666443,
1179
+ "eval_rewards/accuracies": 0.6349206566810608,
1180
+ "eval_rewards/chosen": -0.044887345284223557,
1181
+ "eval_rewards/margins": 0.373458594083786,
1182
+ "eval_rewards/rejected": -0.41834595799446106,
1183
+ "eval_runtime": 22.4693,
1184
+ "eval_samples_per_second": 89.01,
1185
+ "eval_steps_per_second": 2.804,
1186
+ "step": 700
1187
+ },
1188
+ {
1189
+ "epoch": 1.4865218529180844,
1190
+ "grad_norm": 29.793015750302608,
1191
+ "learning_rate": 1.8662200511184872e-07,
1192
+ "logits/chosen": -1.5391782522201538,
1193
+ "logits/rejected": -1.5692270994186401,
1194
+ "logps/chosen": -353.60504150390625,
1195
+ "logps/rejected": -327.60333251953125,
1196
+ "loss": 0.6413,
1197
+ "rewards/accuracies": 0.59375,
1198
+ "rewards/chosen": -0.08452965319156647,
1199
+ "rewards/margins": 0.3078867793083191,
1200
+ "rewards/rejected": -0.39241647720336914,
1201
+ "step": 710
1202
+ },
1203
+ {
1204
+ "epoch": 1.5074587804239727,
1205
+ "grad_norm": 30.154215075312383,
1206
+ "learning_rate": 1.725696330273575e-07,
1207
+ "logits/chosen": -1.4344627857208252,
1208
+ "logits/rejected": -1.4114643335342407,
1209
+ "logps/chosen": -367.28668212890625,
1210
+ "logps/rejected": -330.210205078125,
1211
+ "loss": 0.6152,
1212
+ "rewards/accuracies": 0.675000011920929,
1213
+ "rewards/chosen": -0.0991826206445694,
1214
+ "rewards/margins": 0.3690223693847656,
1215
+ "rewards/rejected": -0.46820497512817383,
1216
+ "step": 720
1217
+ },
1218
+ {
1219
+ "epoch": 1.5283957079298613,
1220
+ "grad_norm": 32.49028184373073,
1221
+ "learning_rate": 1.589561912846089e-07,
1222
+ "logits/chosen": -1.3968040943145752,
1223
+ "logits/rejected": -1.4468437433242798,
1224
+ "logps/chosen": -339.4754943847656,
1225
+ "logps/rejected": -298.7085266113281,
1226
+ "loss": 0.6139,
1227
+ "rewards/accuracies": 0.65625,
1228
+ "rewards/chosen": -0.04794057086110115,
1229
+ "rewards/margins": 0.3862064480781555,
1230
+ "rewards/rejected": -0.43414703011512756,
1231
+ "step": 730
1232
+ },
1233
+ {
1234
+ "epoch": 1.54933263543575,
1235
+ "grad_norm": 29.612038431579933,
1236
+ "learning_rate": 1.4579992911531496e-07,
1237
+ "logits/chosen": -1.2432403564453125,
1238
+ "logits/rejected": -1.2390873432159424,
1239
+ "logps/chosen": -379.5582275390625,
1240
+ "logps/rejected": -332.1794128417969,
1241
+ "loss": 0.6205,
1242
+ "rewards/accuracies": 0.6625000238418579,
1243
+ "rewards/chosen": 0.019659820944070816,
1244
+ "rewards/margins": 0.3802977204322815,
1245
+ "rewards/rejected": -0.360637903213501,
1246
+ "step": 740
1247
+ },
1248
+ {
1249
+ "epoch": 1.5702695629416383,
1250
+ "grad_norm": 28.119711308052274,
1251
+ "learning_rate": 1.3311848288809813e-07,
1252
+ "logits/chosen": -1.4195433855056763,
1253
+ "logits/rejected": -1.490561842918396,
1254
+ "logps/chosen": -359.9119873046875,
1255
+ "logps/rejected": -322.4441833496094,
1256
+ "loss": 0.6118,
1257
+ "rewards/accuracies": 0.6312500238418579,
1258
+ "rewards/chosen": -0.0712754875421524,
1259
+ "rewards/margins": 0.2602451741695404,
1260
+ "rewards/rejected": -0.3315206468105316,
1261
+ "step": 750
1262
+ },
1263
+ {
1264
+ "epoch": 1.5912064904475267,
1265
+ "grad_norm": 29.268690125436255,
1266
+ "learning_rate": 1.209288524664029e-07,
1267
+ "logits/chosen": -1.229116439819336,
1268
+ "logits/rejected": -1.2306039333343506,
1269
+ "logps/chosen": -435.7142028808594,
1270
+ "logps/rejected": -397.52044677734375,
1271
+ "loss": 0.5971,
1272
+ "rewards/accuracies": 0.668749988079071,
1273
+ "rewards/chosen": 0.11046306788921356,
1274
+ "rewards/margins": 0.4868060052394867,
1275
+ "rewards/rejected": -0.3763429522514343,
1276
+ "step": 760
1277
+ },
1278
+ {
1279
+ "epoch": 1.6121434179534153,
1280
+ "grad_norm": 28.29047720088543,
1281
+ "learning_rate": 1.0924737841966497e-07,
1282
+ "logits/chosen": -1.220485806465149,
1283
+ "logits/rejected": -1.4240710735321045,
1284
+ "logps/chosen": -396.7095947265625,
1285
+ "logps/rejected": -319.75958251953125,
1286
+ "loss": 0.6247,
1287
+ "rewards/accuracies": 0.668749988079071,
1288
+ "rewards/chosen": -0.055124759674072266,
1289
+ "rewards/margins": 0.39260005950927734,
1290
+ "rewards/rejected": -0.44772475957870483,
1291
+ "step": 770
1292
+ },
1293
+ {
1294
+ "epoch": 1.633080345459304,
1295
+ "grad_norm": 29.17224056970089,
1296
+ "learning_rate": 9.808972011828054e-08,
1297
+ "logits/chosen": -1.2811758518218994,
1298
+ "logits/rejected": -1.3020236492156982,
1299
+ "logps/chosen": -384.5751647949219,
1300
+ "logps/rejected": -329.9205017089844,
1301
+ "loss": 0.626,
1302
+ "rewards/accuracies": 0.6937500238418579,
1303
+ "rewards/chosen": 0.14039060473442078,
1304
+ "rewards/margins": 0.46962863206863403,
1305
+ "rewards/rejected": -0.32923805713653564,
1306
+ "step": 780
1307
+ },
1308
+ {
1309
+ "epoch": 1.6540172729651923,
1310
+ "grad_norm": 33.75145778470617,
1311
+ "learning_rate": 8.747083474174527e-08,
1312
+ "logits/chosen": -1.3885407447814941,
1313
+ "logits/rejected": -1.4309613704681396,
1314
+ "logps/chosen": -365.4181823730469,
1315
+ "logps/rejected": -322.75054931640625,
1316
+ "loss": 0.6123,
1317
+ "rewards/accuracies": 0.625,
1318
+ "rewards/chosen": -0.04203175753355026,
1319
+ "rewards/margins": 0.3509315550327301,
1320
+ "rewards/rejected": -0.39296332001686096,
1321
+ "step": 790
1322
+ },
1323
+ {
1324
+ "epoch": 1.674954200471081,
1325
+ "grad_norm": 25.187206733853984,
1326
+ "learning_rate": 7.740495722810269e-08,
1327
+ "logits/chosen": -1.2112213373184204,
1328
+ "logits/rejected": -1.3428361415863037,
1329
+ "logps/chosen": -420.00830078125,
1330
+ "logps/rejected": -358.3287353515625,
1331
+ "loss": 0.5972,
1332
+ "rewards/accuracies": 0.6937500238418579,
1333
+ "rewards/chosen": 0.01959555223584175,
1334
+ "rewards/margins": 0.5057805776596069,
1335
+ "rewards/rejected": -0.4861849844455719,
1336
+ "step": 800
1337
+ },
1338
+ {
1339
+ "epoch": 1.674954200471081,
1340
+ "eval_logits/chosen": -1.639930009841919,
1341
+ "eval_logits/rejected": -1.6965380907058716,
1342
+ "eval_logps/chosen": -375.7169494628906,
1343
+ "eval_logps/rejected": -323.76556396484375,
1344
+ "eval_loss": 0.6384284496307373,
1345
+ "eval_rewards/accuracies": 0.6428571343421936,
1346
+ "eval_rewards/chosen": -0.06950785219669342,
1347
+ "eval_rewards/margins": 0.34436169266700745,
1348
+ "eval_rewards/rejected": -0.41386955976486206,
1349
+ "eval_runtime": 21.8454,
1350
+ "eval_samples_per_second": 91.552,
1351
+ "eval_steps_per_second": 2.884,
1352
+ "step": 800
1353
+ },
1354
+ {
1355
+ "epoch": 1.6958911279769695,
1356
+ "grad_norm": 30.26944195604407,
1357
+ "learning_rate": 6.790558119157597e-08,
1358
+ "logits/chosen": -1.3412492275238037,
1359
+ "logits/rejected": -1.3922173976898193,
1360
+ "logps/chosen": -375.84429931640625,
1361
+ "logps/rejected": -326.71417236328125,
1362
+ "loss": 0.6295,
1363
+ "rewards/accuracies": 0.699999988079071,
1364
+ "rewards/chosen": -0.015008327551186085,
1365
+ "rewards/margins": 0.4259034991264343,
1366
+ "rewards/rejected": -0.44091176986694336,
1367
+ "step": 810
1368
+ },
1369
+ {
1370
+ "epoch": 1.7168280554828579,
1371
+ "grad_norm": 30.885265100170944,
1372
+ "learning_rate": 5.898544083397e-08,
1373
+ "logits/chosen": -1.4171245098114014,
1374
+ "logits/rejected": -1.5250798463821411,
1375
+ "logps/chosen": -388.115478515625,
1376
+ "logps/rejected": -320.20819091796875,
1377
+ "loss": 0.6055,
1378
+ "rewards/accuracies": 0.6312500238418579,
1379
+ "rewards/chosen": -0.05425529554486275,
1380
+ "rewards/margins": 0.4603540003299713,
1381
+ "rewards/rejected": -0.5146092176437378,
1382
+ "step": 820
1383
+ },
1384
+ {
1385
+ "epoch": 1.7377649829887463,
1386
+ "grad_norm": 32.971600431805705,
1387
+ "learning_rate": 5.065649387408705e-08,
1388
+ "logits/chosen": -1.3083207607269287,
1389
+ "logits/rejected": -1.2884520292282104,
1390
+ "logps/chosen": -338.13909912109375,
1391
+ "logps/rejected": -327.7836608886719,
1392
+ "loss": 0.6083,
1393
+ "rewards/accuracies": 0.668749988079071,
1394
+ "rewards/chosen": -0.11997182667255402,
1395
+ "rewards/margins": 0.3927108347415924,
1396
+ "rewards/rejected": -0.5126826167106628,
1397
+ "step": 830
1398
+ },
1399
+ {
1400
+ "epoch": 1.7587019104946349,
1401
+ "grad_norm": 29.20392566444622,
1402
+ "learning_rate": 4.292990551804171e-08,
1403
+ "logits/chosen": -1.5097484588623047,
1404
+ "logits/rejected": -1.4066120386123657,
1405
+ "logps/chosen": -314.53338623046875,
1406
+ "logps/rejected": -309.9684143066406,
1407
+ "loss": 0.6346,
1408
+ "rewards/accuracies": 0.6187499761581421,
1409
+ "rewards/chosen": -0.06001155450940132,
1410
+ "rewards/margins": 0.30454209446907043,
1411
+ "rewards/rejected": -0.36455363035202026,
1412
+ "step": 840
1413
+ },
1414
+ {
1415
+ "epoch": 1.7796388380005235,
1416
+ "grad_norm": 30.280553572438347,
1417
+ "learning_rate": 3.581603349196371e-08,
1418
+ "logits/chosen": -1.3757822513580322,
1419
+ "logits/rejected": -1.3954650163650513,
1420
+ "logps/chosen": -331.03826904296875,
1421
+ "logps/rejected": -323.1974182128906,
1422
+ "loss": 0.6233,
1423
+ "rewards/accuracies": 0.625,
1424
+ "rewards/chosen": -0.10794013738632202,
1425
+ "rewards/margins": 0.2812719941139221,
1426
+ "rewards/rejected": -0.38921213150024414,
1427
+ "step": 850
1428
+ },
1429
+ {
1430
+ "epoch": 1.8005757655064119,
1431
+ "grad_norm": 28.100566523523845,
1432
+ "learning_rate": 2.9324414157151367e-08,
1433
+ "logits/chosen": -1.3457200527191162,
1434
+ "logits/rejected": -1.4598686695098877,
1435
+ "logps/chosen": -356.97216796875,
1436
+ "logps/rejected": -286.18865966796875,
1437
+ "loss": 0.6215,
1438
+ "rewards/accuracies": 0.612500011920929,
1439
+ "rewards/chosen": -0.05811784416437149,
1440
+ "rewards/margins": 0.35237258672714233,
1441
+ "rewards/rejected": -0.4104904234409332,
1442
+ "step": 860
1443
+ },
1444
+ {
1445
+ "epoch": 1.8215126930123005,
1446
+ "grad_norm": 26.46890964804442,
1447
+ "learning_rate": 2.3463749726290284e-08,
1448
+ "logits/chosen": -1.4204833507537842,
1449
+ "logits/rejected": -1.5046513080596924,
1450
+ "logps/chosen": -404.6219177246094,
1451
+ "logps/rejected": -336.10150146484375,
1452
+ "loss": 0.6261,
1453
+ "rewards/accuracies": 0.6499999761581421,
1454
+ "rewards/chosen": 0.02932531200349331,
1455
+ "rewards/margins": 0.39560848474502563,
1456
+ "rewards/rejected": -0.366283118724823,
1457
+ "step": 870
1458
+ },
1459
+ {
1460
+ "epoch": 1.842449620518189,
1461
+ "grad_norm": 31.701066186197266,
1462
+ "learning_rate": 1.824189659787284e-08,
1463
+ "logits/chosen": -1.3159443140029907,
1464
+ "logits/rejected": -1.3131808042526245,
1465
+ "logps/chosen": -328.07098388671875,
1466
+ "logps/rejected": -308.87701416015625,
1467
+ "loss": 0.6266,
1468
+ "rewards/accuracies": 0.606249988079071,
1469
+ "rewards/chosen": -0.0714392215013504,
1470
+ "rewards/margins": 0.3882763683795929,
1471
+ "rewards/rejected": -0.45971551537513733,
1472
+ "step": 880
1473
+ },
1474
+ {
1475
+ "epoch": 1.8633865480240774,
1476
+ "grad_norm": 31.12356616666238,
1477
+ "learning_rate": 1.3665854824458035e-08,
1478
+ "logits/chosen": -1.3845950365066528,
1479
+ "logits/rejected": -1.434497594833374,
1480
+ "logps/chosen": -376.0118103027344,
1481
+ "logps/rejected": -331.4889831542969,
1482
+ "loss": 0.6095,
1483
+ "rewards/accuracies": 0.731249988079071,
1484
+ "rewards/chosen": -0.061675846576690674,
1485
+ "rewards/margins": 0.3783223330974579,
1486
+ "rewards/rejected": -0.43999820947647095,
1487
+ "step": 890
1488
+ },
1489
+ {
1490
+ "epoch": 1.8843234755299658,
1491
+ "grad_norm": 34.05071913262595,
1492
+ "learning_rate": 9.741758728888217e-09,
1493
+ "logits/chosen": -1.2828714847564697,
1494
+ "logits/rejected": -1.3924423456192017,
1495
+ "logps/chosen": -401.4546203613281,
1496
+ "logps/rejected": -317.35443115234375,
1497
+ "loss": 0.6207,
1498
+ "rewards/accuracies": 0.6312500238418579,
1499
+ "rewards/chosen": -0.004245785064995289,
1500
+ "rewards/margins": 0.4051188826560974,
1501
+ "rewards/rejected": -0.40936464071273804,
1502
+ "step": 900
1503
+ },
1504
+ {
1505
+ "epoch": 1.8843234755299658,
1506
+ "eval_logits/chosen": -1.6407170295715332,
1507
+ "eval_logits/rejected": -1.6976144313812256,
1508
+ "eval_logps/chosen": -375.70330810546875,
1509
+ "eval_logps/rejected": -323.78216552734375,
1510
+ "eval_loss": 0.6362326145172119,
1511
+ "eval_rewards/accuracies": 0.6785714030265808,
1512
+ "eval_rewards/chosen": -0.06266693770885468,
1513
+ "eval_rewards/margins": 0.3595210015773773,
1514
+ "eval_rewards/rejected": -0.4221878945827484,
1515
+ "eval_runtime": 22.2411,
1516
+ "eval_samples_per_second": 89.923,
1517
+ "eval_steps_per_second": 2.833,
1518
+ "step": 900
1519
+ },
1520
+ {
1521
+ "epoch": 1.9052604030358546,
1522
+ "grad_norm": 28.324746122618627,
1523
+ "learning_rate": 6.474868681043577e-09,
1524
+ "logits/chosen": -1.232722282409668,
1525
+ "logits/rejected": -1.3370740413665771,
1526
+ "logps/chosen": -330.0971984863281,
1527
+ "logps/rejected": -274.8087463378906,
1528
+ "loss": 0.6315,
1529
+ "rewards/accuracies": 0.6875,
1530
+ "rewards/chosen": -0.04480002075433731,
1531
+ "rewards/margins": 0.4823727011680603,
1532
+ "rewards/rejected": -0.5271727442741394,
1533
+ "step": 910
1534
+ },
1535
+ {
1536
+ "epoch": 1.926197330541743,
1537
+ "grad_norm": 32.98699943344935,
1538
+ "learning_rate": 3.869564046156459e-09,
1539
+ "logits/chosen": -1.3717395067214966,
1540
+ "logits/rejected": -1.4535510540008545,
1541
+ "logps/chosen": -373.8208923339844,
1542
+ "logps/rejected": -311.4999084472656,
1543
+ "loss": 0.6198,
1544
+ "rewards/accuracies": 0.6499999761581421,
1545
+ "rewards/chosen": -0.06762387603521347,
1546
+ "rewards/margins": 0.35773369669914246,
1547
+ "rewards/rejected": -0.4253575801849365,
1548
+ "step": 920
1549
+ },
1550
+ {
1551
+ "epoch": 1.9471342580476314,
1552
+ "grad_norm": 27.5401666731791,
1553
+ "learning_rate": 1.929337314139412e-09,
1554
+ "logits/chosen": -1.2926667928695679,
1555
+ "logits/rejected": -1.3644298315048218,
1556
+ "logps/chosen": -362.7403869628906,
1557
+ "logps/rejected": -317.42303466796875,
1558
+ "loss": 0.6124,
1559
+ "rewards/accuracies": 0.6875,
1560
+ "rewards/chosen": 0.01872597262263298,
1561
+ "rewards/margins": 0.4235456883907318,
1562
+ "rewards/rejected": -0.40481966733932495,
1563
+ "step": 930
1564
+ },
1565
+ {
1566
+ "epoch": 1.96807118555352,
1567
+ "grad_norm": 31.615150149296902,
1568
+ "learning_rate": 6.567894177967325e-10,
1569
+ "logits/chosen": -1.3021799325942993,
1570
+ "logits/rejected": -1.3374966382980347,
1571
+ "logps/chosen": -324.93255615234375,
1572
+ "logps/rejected": -273.21929931640625,
1573
+ "loss": 0.6255,
1574
+ "rewards/accuracies": 0.6937500238418579,
1575
+ "rewards/chosen": -0.05069868639111519,
1576
+ "rewards/margins": 0.33984681963920593,
1577
+ "rewards/rejected": -0.390545517206192,
1578
+ "step": 940
1579
+ },
1580
+ {
1581
+ "epoch": 1.9890081130594086,
1582
+ "grad_norm": 29.09184931904493,
1583
+ "learning_rate": 5.3626246194704575e-11,
1584
+ "logits/chosen": -1.3507764339447021,
1585
+ "logits/rejected": -1.4648717641830444,
1586
+ "logps/chosen": -361.6529235839844,
1587
+ "logps/rejected": -296.1907043457031,
1588
+ "loss": 0.6169,
1589
+ "rewards/accuracies": 0.637499988079071,
1590
+ "rewards/chosen": -0.08508863300085068,
1591
+ "rewards/margins": 0.3246951401233673,
1592
+ "rewards/rejected": -0.40978384017944336,
1593
+ "step": 950
1594
+ },
1595
+ {
1596
+ "epoch": 1.9973828840617638,
1597
+ "step": 954,
1598
+ "total_flos": 0.0,
1599
+ "train_loss": 0.6516540072998911,
1600
+ "train_runtime": 5944.7081,
1601
+ "train_samples_per_second": 20.568,
1602
+ "train_steps_per_second": 0.16
1603
+ }
1604
+ ],
1605
+ "logging_steps": 10,
1606
+ "max_steps": 954,
1607
+ "num_input_tokens_seen": 0,
1608
+ "num_train_epochs": 2,
1609
+ "save_steps": 500,
1610
+ "stateful_callbacks": {
1611
+ "TrainerControl": {
1612
+ "args": {
1613
+ "should_epoch_stop": false,
1614
+ "should_evaluate": false,
1615
+ "should_log": false,
1616
+ "should_save": false,
1617
+ "should_training_stop": false
1618
+ },
1619
+ "attributes": {}
1620
+ }
1621
+ },
1622
+ "total_flos": 0.0,
1623
+ "train_batch_size": 2,
1624
+ "trial_name": null,
1625
+ "trial_params": null
1626
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b7e0f2e2a3074b496c74cd65dd47e8da90de5ccf808bc233cb1420827c7d68f
3
+ size 6520
vocab.json ADDED
The diff for this file is too large to render. See raw diff