kyryl-georgian committed
Commit 6da15f5
1 Parent(s): 6f4a345

End of training

README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [google/flan-t5-small](https://huggingface.co/google/flan-t5-small) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.6025
+- Loss: 0.1072
 
 ## Model description
 
@@ -39,12 +39,29 @@ The following hyperparameters were used during training:
 - train_batch_size: 16
 - eval_batch_size: 16
 - seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- total_train_batch_size: 128
+- total_eval_batch_size: 128
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
-- num_epochs: 0.001
+- num_epochs: 10.0
 
 ### Training results
 
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.4003        | 0.9   | 500  | 0.1629          |
+| 0.2314        | 1.81  | 1000 | 0.1386          |
+| 0.2065        | 2.71  | 1500 | 0.1289          |
+| 0.187         | 3.62  | 2000 | 0.1233          |
+| 0.1791        | 4.52  | 2500 | 0.1169          |
+| 0.1713        | 5.42  | 3000 | 0.1153          |
+| 0.1661        | 6.33  | 3500 | 0.1122          |
+| 0.1604        | 7.23  | 4000 | 0.1085          |
+| 0.1574        | 8.14  | 4500 | 0.1099          |
+| 0.1541        | 9.04  | 5000 | 0.1064          |
+| 0.1521        | 9.95  | 5500 | 0.1071          |
 
 
 ### Framework versions
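
The added hyperparameters above map onto a standard `transformers` training setup. The following is a minimal sketch only, not the actual training script from this repo: `output_dir` is a placeholder, the dataset/model wiring is omitted, and `learning_rate=1e-3` is inferred from the linear decay logged in `trainer_state.json` below rather than taken from this hunk.

```python
# Hypothetical reconstruction of the run configuration shown in the README diff.
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="flan-t5-small-lora",   # placeholder name, not from the diff
    per_device_train_batch_size=16,    # train_batch_size: 16
    per_device_eval_batch_size=16,     # eval_batch_size: 16
    seed=42,
    learning_rate=1e-3,                # assumption: consistent with the lr decay in trainer_state.json
    num_train_epochs=10.0,
    lr_scheduler_type="linear",
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    evaluation_strategy="steps",       # eval/logging/save every 500 steps per trainer_state.json
    eval_steps=500,
    logging_steps=500,
    save_steps=500,
)
# Launched on 8 GPUs (e.g. via torchrun or accelerate), 8 * 16 gives the
# total_train_batch_size / total_eval_batch_size of 128 reported above.
```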
adapter_config.json CHANGED
@@ -19,8 +19,8 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q",
-    "v"
+    "v",
+    "q"
   ],
   "task_type": "SEQ_2_SEQ_LM"
 }
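
The only change in this file is the order of the `target_modules` list; PEFT treats it as a set, so the adapter still wraps the same T5 attention projections (`q` and `v`). A rough sketch of how this config would be built with `peft` follows; the rank, alpha, and dropout values sit outside this hunk and are shown as placeholders.

```python
# Sketch of the adapter setup implied by adapter_config.json; r/lora_alpha/lora_dropout
# are placeholders, not values taken from this diff.
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSeq2SeqLM

base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["v", "q"],  # order does not matter; T5 attention query/value projections
    r=8,                        # placeholder
    lora_alpha=16,              # placeholder
    lora_dropout=0.05,          # placeholder
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
```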
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:088861e13048d008a84b5bcea780b9d9f5ad691da24ba9da77ef998a2c522c2f
+oid sha256:4af10a22bd170f6a681891351deeaa4edddd58bc62b4d0732320b6f5fcd408fd
 size 2765880
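
Only the LFS pointer changes here: the retrained adapter weights replace the old object at the same size (2,765,880 bytes). A hedged loading sketch, with the adapter repo id left as a placeholder since it is not shown in this diff:

```python
# Load the LoRA adapter on top of the base model; replace the placeholder path
# with the actual Hub repo id or local directory for this adapter.
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = PeftModel.from_pretrained(base, "path/to/this-adapter-repo")  # placeholder
```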
all_results.json CHANGED
@@ -1,11 +1,11 @@
 {
-    "epoch": 0.0,
-    "eval_loss": 1.6024976968765259,
-    "eval_runtime": 13.4905,
-    "eval_samples_per_second": 582.482,
-    "eval_steps_per_second": 36.47,
-    "train_loss": 2.5696409225463865,
-    "train_runtime": 1.0509,
-    "train_samples_per_second": 67.292,
-    "train_steps_per_second": 4.758
+    "epoch": 10.0,
+    "eval_loss": 0.10718318819999695,
+    "eval_runtime": 3.0141,
+    "eval_samples_per_second": 2607.118,
+    "eval_steps_per_second": 20.57,
+    "train_loss": 0.19658087959772425,
+    "train_runtime": 755.9435,
+    "train_samples_per_second": 935.506,
+    "train_steps_per_second": 7.315
 }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
 {
-    "epoch": 0.0,
-    "eval_loss": 1.6024976968765259,
-    "eval_runtime": 13.4905,
-    "eval_samples_per_second": 582.482,
-    "eval_steps_per_second": 36.47
+    "epoch": 10.0,
+    "eval_loss": 0.10718318819999695,
+    "eval_runtime": 3.0141,
+    "eval_samples_per_second": 2607.118,
+    "eval_steps_per_second": 20.57
 }
train_results.json CHANGED
@@ -1,7 +1,7 @@
 {
-    "epoch": 0.0,
-    "train_loss": 2.5696409225463865,
-    "train_runtime": 1.0509,
-    "train_samples_per_second": 67.292,
-    "train_steps_per_second": 4.758
+    "epoch": 10.0,
+    "train_loss": 0.19658087959772425,
+    "train_runtime": 755.9435,
+    "train_samples_per_second": 935.506,
+    "train_steps_per_second": 7.315
 }
trainer_state.json CHANGED
@@ -1,29 +1,194 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.0011312217194570137,
+  "epoch": 10.0,
   "eval_steps": 500,
-  "global_step": 5,
+  "global_step": 5530,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0,
-      "step": 5,
-      "total_flos": 2585060966400.0,
-      "train_loss": 2.5696409225463865,
-      "train_runtime": 1.0509,
-      "train_samples_per_second": 67.292,
-      "train_steps_per_second": 4.758
+      "epoch": 0.9,
+      "grad_norm": 0.1664367914199829,
+      "learning_rate": 0.0009095840867992767,
+      "loss": 0.4003,
+      "step": 500
+    },
+    {
+      "epoch": 0.9,
+      "eval_loss": 0.162927508354187,
+      "eval_runtime": 3.0318,
+      "eval_samples_per_second": 2591.854,
+      "eval_steps_per_second": 20.45,
+      "step": 500
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.1386815905570984,
+      "learning_rate": 0.0008191681735985533,
+      "loss": 0.2314,
+      "step": 1000
+    },
+    {
+      "epoch": 1.81,
+      "eval_loss": 0.1386137157678604,
+      "eval_runtime": 3.0434,
+      "eval_samples_per_second": 2582.007,
+      "eval_steps_per_second": 20.372,
+      "step": 1000
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 0.1781063824892044,
+      "learning_rate": 0.0007287522603978301,
+      "loss": 0.2065,
+      "step": 1500
+    },
+    {
+      "epoch": 2.71,
+      "eval_loss": 0.1289130598306656,
+      "eval_runtime": 3.0677,
+      "eval_samples_per_second": 2561.542,
+      "eval_steps_per_second": 20.211,
+      "step": 1500
+    },
+    {
+      "epoch": 3.62,
+      "grad_norm": 0.15570929646492004,
+      "learning_rate": 0.0006383363471971068,
+      "loss": 0.187,
+      "step": 2000
+    },
+    {
+      "epoch": 3.62,
+      "eval_loss": 0.12326223403215408,
+      "eval_runtime": 3.0605,
+      "eval_samples_per_second": 2567.579,
+      "eval_steps_per_second": 20.258,
+      "step": 2000
+    },
+    {
+      "epoch": 4.52,
+      "grad_norm": 0.16776247322559357,
+      "learning_rate": 0.0005479204339963833,
+      "loss": 0.1791,
+      "step": 2500
+    },
+    {
+      "epoch": 4.52,
+      "eval_loss": 0.1168670803308487,
+      "eval_runtime": 3.0473,
+      "eval_samples_per_second": 2578.705,
+      "eval_steps_per_second": 20.346,
+      "step": 2500
+    },
+    {
+      "epoch": 5.42,
+      "grad_norm": 0.1355486512184143,
+      "learning_rate": 0.0004575045207956601,
+      "loss": 0.1713,
+      "step": 3000
+    },
+    {
+      "epoch": 5.42,
+      "eval_loss": 0.11528698354959488,
+      "eval_runtime": 3.0013,
+      "eval_samples_per_second": 2618.163,
+      "eval_steps_per_second": 20.657,
+      "step": 3000
+    },
+    {
+      "epoch": 6.33,
+      "grad_norm": 0.16372531652450562,
+      "learning_rate": 0.0003670886075949367,
+      "loss": 0.1661,
+      "step": 3500
+    },
+    {
+      "epoch": 6.33,
+      "eval_loss": 0.11218289285898209,
+      "eval_runtime": 2.9586,
+      "eval_samples_per_second": 2655.959,
+      "eval_steps_per_second": 20.956,
+      "step": 3500
+    },
+    {
+      "epoch": 7.23,
+      "grad_norm": 0.1596778929233551,
+      "learning_rate": 0.0002766726943942134,
+      "loss": 0.1604,
+      "step": 4000
+    },
+    {
+      "epoch": 7.23,
+      "eval_loss": 0.1085081547498703,
+      "eval_runtime": 2.9539,
+      "eval_samples_per_second": 2660.243,
+      "eval_steps_per_second": 20.989,
+      "step": 4000
+    },
+    {
+      "epoch": 8.14,
+      "grad_norm": 0.15582768619060516,
+      "learning_rate": 0.00018625678119349006,
+      "loss": 0.1574,
+      "step": 4500
+    },
+    {
+      "epoch": 8.14,
+      "eval_loss": 0.1098729744553566,
+      "eval_runtime": 2.9739,
+      "eval_samples_per_second": 2642.311,
+      "eval_steps_per_second": 20.848,
+      "step": 4500
+    },
+    {
+      "epoch": 9.04,
+      "grad_norm": 0.15063905715942383,
+      "learning_rate": 9.584086799276672e-05,
+      "loss": 0.1541,
+      "step": 5000
+    },
+    {
+      "epoch": 9.04,
+      "eval_loss": 0.10638037323951721,
+      "eval_runtime": 3.0665,
+      "eval_samples_per_second": 2562.534,
+      "eval_steps_per_second": 20.219,
+      "step": 5000
+    },
+    {
+      "epoch": 9.95,
+      "grad_norm": 0.14130930602550507,
+      "learning_rate": 5.4249547920433995e-06,
+      "loss": 0.1521,
+      "step": 5500
+    },
+    {
+      "epoch": 9.95,
+      "eval_loss": 0.1071261540055275,
+      "eval_runtime": 3.024,
+      "eval_samples_per_second": 2598.51,
+      "eval_steps_per_second": 20.502,
+      "step": 5500
+    },
+    {
+      "epoch": 10.0,
+      "step": 5530,
+      "total_flos": 2.2872619342626816e+16,
+      "train_loss": 0.19658087959772425,
+      "train_runtime": 755.9435,
+      "train_samples_per_second": 935.506,
+      "train_steps_per_second": 7.315
     }
   ],
   "logging_steps": 500,
-  "max_steps": 5,
+  "max_steps": 5530,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
+  "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 2585060966400.0,
+  "total_flos": 2.2872619342626816e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e5964aa42a58f9aa0442e4ab87d38a471e1527ac951e91bea56c4d8f43acfd26
+oid sha256:1791b93576383f366ada713fe62fa1a5066567f1c635c3f329bc8f36e8673a58
 size 5048
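
`training_args.bin` is a pickled `TrainingArguments` object, so the diff only shows the updated LFS pointer. To inspect the new arguments locally, something along these lines should work; note that recent PyTorch versions require opting out of `weights_only` loading for arbitrary pickled objects.

```python
# Inspect the pickled TrainingArguments (assumes the file has been downloaded locally).
import torch

args = torch.load("training_args.bin", weights_only=False)  # arbitrary pickle, not a tensor file
print(args.num_train_epochs, args.lr_scheduler_type, args.per_device_train_batch_size)
```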