chansung commited on
Commit
f10c408
·
verified ·
1 Parent(s): 6102a60

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 2.3268
24
 
25
  ## Model description
26
 
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
- | 0.7852 | 1.0 | 130 | 2.3268 |
61
 
62
 
63
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 2.2840
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
+ | 0.7805 | 1.0 | 130 | 2.2840 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 2.326753616333008,
4
- "eval_runtime": 0.4763,
5
- "eval_samples": 15,
6
- "eval_samples_per_second": 25.195,
7
- "eval_steps_per_second": 2.1,
8
  "total_flos": 17867533713408.0,
9
- "train_loss": 2.6317037490698008,
10
- "train_runtime": 910.2032,
11
  "train_samples": 111440,
12
- "train_samples_per_second": 36.528,
13
  "train_steps_per_second": 0.143
14
  }
 
1
  {
2
  "epoch": 1.0,
 
 
 
 
 
3
  "total_flos": 17867533713408.0,
4
+ "train_loss": 2.5464949974646935,
5
+ "train_runtime": 907.3672,
6
  "train_samples": 111440,
7
+ "train_samples_per_second": 36.642,
8
  "train_steps_per_second": 0.143
9
  }
config.json CHANGED
@@ -24,6 +24,6 @@
24
  "rope_theta": 10000.0,
25
  "torch_dtype": "bfloat16",
26
  "transformers_version": "4.46.3",
27
- "use_cache": true,
28
  "vocab_size": 256000
29
  }
 
24
  "rope_theta": 10000.0,
25
  "torch_dtype": "bfloat16",
26
  "transformers_version": "4.46.3",
27
+ "use_cache": false,
28
  "vocab_size": 256000
29
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e268a4e2fd577919456f59597df331446976ae2d378e1c9ef926ffb6d84644ee
3
  size 4995496656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c9b31611844f916240641ce8e3df4f3c49bb2413f2e0e589feb58beb1dbf548
3
  size 4995496656
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23e6cebf4a4318b02f17add4fbd59261a5282794878e3a07737cd2a4ad5c352d
3
  size 4982953168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e4c92be6f7e62fab237512ae1baddbeb4f7a70fa29d41949d7a9754a7dbe334
3
  size 4982953168
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:246a8a6292d87d9f4915fb08ab18eb9b0d6a706a2f61985c4fa70dd49ed20735
3
  size 4982953200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d9593409430e230f71effe8a50b391b38d156e3d3bf618d5c04176f93af458f
3
  size 4982953200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4dbe3e6ca3dff5ac890e8608614f331cb13b9df79aef9cbb9fd8c0d8c63a2726
3
  size 2113988336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d2c2f429d8534eb8ba655e2d9a0056f72a8545d46b09cf7ceef58ebb19255c2
3
  size 2113988336
runs/Nov21_18-12-56_main-fft-gemma7b-closedqa-0-0/events.out.tfevents.1732231455.main-fft-gemma7b-closedqa-0-0.544.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8a4ac83aec09bea9663cb07e472f0c7dc63906f3087493402289c39c0805fe8
3
+ size 12066
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 17867533713408.0,
4
- "train_loss": 2.6317037490698008,
5
- "train_runtime": 910.2032,
6
  "train_samples": 111440,
7
- "train_samples_per_second": 36.528,
8
  "train_steps_per_second": 0.143
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 17867533713408.0,
4
+ "train_loss": 2.5464949974646935,
5
+ "train_runtime": 907.3672,
6
  "train_samples": 111440,
7
+ "train_samples_per_second": 36.642,
8
  "train_steps_per_second": 0.143
9
  }
trainer_state.json CHANGED
@@ -10,208 +10,208 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.007692307692307693,
13
- "grad_norm": 1824.052603668407,
14
  "learning_rate": 1.5384615384615387e-06,
15
  "loss": 28.7448,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.038461538461538464,
20
- "grad_norm": 316.6103639423174,
21
  "learning_rate": 7.692307692307694e-06,
22
- "loss": 21.1101,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.07692307692307693,
27
- "grad_norm": 65.24932753249017,
28
  "learning_rate": 1.5384615384615387e-05,
29
- "loss": 13.727,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.11538461538461539,
34
- "grad_norm": 42.92103955322172,
35
  "learning_rate": 1.9985583705641418e-05,
36
- "loss": 6.179,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.15384615384615385,
41
- "grad_norm": 440.32026332346305,
42
  "learning_rate": 1.9823877374156647e-05,
43
- "loss": 3.1748,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.19230769230769232,
48
- "grad_norm": 18.09915025408189,
49
  "learning_rate": 1.9485364419471454e-05,
50
- "loss": 2.2618,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.23076923076923078,
55
- "grad_norm": 16.637938118964563,
56
  "learning_rate": 1.8976137276390145e-05,
57
- "loss": 1.6812,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.2692307692307692,
62
- "grad_norm": 10.853465738373286,
63
  "learning_rate": 1.8305360832480118e-05,
64
- "loss": 1.507,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.3076923076923077,
69
- "grad_norm": 14.37878731086315,
70
  "learning_rate": 1.7485107481711014e-05,
71
- "loss": 1.3185,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.34615384615384615,
76
- "grad_norm": 9.489828253754519,
77
  "learning_rate": 1.653013984983585e-05,
78
- "loss": 1.1709,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.38461538461538464,
83
- "grad_norm": 9.57640776000177,
84
  "learning_rate": 1.5457645101945046e-05,
85
- "loss": 1.0826,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.4230769230769231,
90
- "grad_norm": 8.920710354614021,
91
  "learning_rate": 1.4286925614030542e-05,
92
- "loss": 1.022,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.46153846153846156,
97
- "grad_norm": 9.46700699458278,
98
  "learning_rate": 1.303905157574247e-05,
99
- "loss": 0.9738,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.5,
104
- "grad_norm": 6.401169561778988,
105
  "learning_rate": 1.1736481776669307e-05,
106
- "loss": 0.9395,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.5384615384615384,
111
- "grad_norm": 3.70149569378144,
112
  "learning_rate": 1.0402659401094154e-05,
113
- "loss": 0.8867,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.5769230769230769,
118
- "grad_norm": 3.481283936858623,
119
  "learning_rate": 9.061590105968208e-06,
120
- "loss": 0.8829,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.6153846153846154,
125
- "grad_norm": 7.314694136144165,
126
  "learning_rate": 7.73740997570278e-06,
127
- "loss": 0.8661,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.6538461538461539,
132
- "grad_norm": 8.561065827412762,
133
  "learning_rate": 6.453951129574644e-06,
134
- "loss": 0.8615,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.6923076923076923,
139
- "grad_norm": 1.800187961424463,
140
  "learning_rate": 5.234312799786921e-06,
141
- "loss": 0.8485,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.7307692307692307,
146
- "grad_norm": 3.8636605355173326,
147
  "learning_rate": 4.100445599768774e-06,
148
- "loss": 0.8287,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.7692307692307693,
153
- "grad_norm": 1.6545101747217463,
154
  "learning_rate": 3.0727564649040066e-06,
155
- "loss": 0.8179,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.8076923076923077,
160
- "grad_norm": 1.3925968714516748,
161
  "learning_rate": 2.1697413758237785e-06,
162
- "loss": 0.8102,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.8461538461538461,
167
- "grad_norm": 1.6352729776359016,
168
  "learning_rate": 1.407652474377832e-06,
169
- "loss": 0.798,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.8846153846153846,
174
- "grad_norm": 0.7372236841264296,
175
  "learning_rate": 8.002055634117578e-07,
176
- "loss": 0.798,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.9230769230769231,
181
- "grad_norm": 0.9284794942335707,
182
  "learning_rate": 3.5833325466437697e-07,
183
- "loss": 0.7863,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.9615384615384616,
188
- "grad_norm": 0.7745401578262785,
189
  "learning_rate": 8.99882075409153e-08,
190
- "loss": 0.7789,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 1.0,
195
- "grad_norm": 0.8356209966296326,
196
  "learning_rate": 0.0,
197
- "loss": 0.7852,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 1.0,
202
- "eval_loss": 2.326753616333008,
203
- "eval_runtime": 0.4596,
204
- "eval_samples_per_second": 26.108,
205
- "eval_steps_per_second": 2.176,
206
  "step": 130
207
  },
208
  {
209
  "epoch": 1.0,
210
  "step": 130,
211
  "total_flos": 17867533713408.0,
212
- "train_loss": 2.6317037490698008,
213
- "train_runtime": 910.2032,
214
- "train_samples_per_second": 36.528,
215
  "train_steps_per_second": 0.143
216
  }
217
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.007692307692307693,
13
+ "grad_norm": 1824.0493249890733,
14
  "learning_rate": 1.5384615384615387e-06,
15
  "loss": 28.7448,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.038461538461538464,
20
+ "grad_norm": 338.1051064004804,
21
  "learning_rate": 7.692307692307694e-06,
22
+ "loss": 21.1097,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.07692307692307693,
27
+ "grad_norm": 60.27893910460336,
28
  "learning_rate": 1.5384615384615387e-05,
29
+ "loss": 13.8645,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.11538461538461539,
34
+ "grad_norm": 19.31150319920863,
35
  "learning_rate": 1.9985583705641418e-05,
36
+ "loss": 5.697,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.15384615384615385,
41
+ "grad_norm": 69.03270795451746,
42
  "learning_rate": 1.9823877374156647e-05,
43
+ "loss": 2.3869,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.19230769230769232,
48
+ "grad_norm": 76.45052582249555,
49
  "learning_rate": 1.9485364419471454e-05,
50
+ "loss": 1.7302,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.23076923076923078,
55
+ "grad_norm": 18.876039351066055,
56
  "learning_rate": 1.8976137276390145e-05,
57
+ "loss": 1.5732,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.2692307692307692,
62
+ "grad_norm": 23.947629924642698,
63
  "learning_rate": 1.8305360832480118e-05,
64
+ "loss": 1.3462,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.3076923076923077,
69
+ "grad_norm": 24.85999234906976,
70
  "learning_rate": 1.7485107481711014e-05,
71
+ "loss": 1.2076,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.34615384615384615,
76
+ "grad_norm": 16.833283516923675,
77
  "learning_rate": 1.653013984983585e-05,
78
+ "loss": 1.1067,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.38461538461538464,
83
+ "grad_norm": 5.260723361500702,
84
  "learning_rate": 1.5457645101945046e-05,
85
+ "loss": 1.0689,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.4230769230769231,
90
+ "grad_norm": 8.067805674932265,
91
  "learning_rate": 1.4286925614030542e-05,
92
+ "loss": 1.0097,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.46153846153846156,
97
+ "grad_norm": 14.86263828160759,
98
  "learning_rate": 1.303905157574247e-05,
99
+ "loss": 0.978,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.5,
104
+ "grad_norm": 6.459278631918247,
105
  "learning_rate": 1.1736481776669307e-05,
106
+ "loss": 0.9093,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.5384615384615384,
111
+ "grad_norm": 2.9078399705453846,
112
  "learning_rate": 1.0402659401094154e-05,
113
+ "loss": 0.8944,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.5769230769230769,
118
+ "grad_norm": 7.660903004613179,
119
  "learning_rate": 9.061590105968208e-06,
120
+ "loss": 0.8973,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.6153846153846154,
125
+ "grad_norm": 5.302988730231839,
126
  "learning_rate": 7.73740997570278e-06,
127
+ "loss": 0.8763,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.6538461538461539,
132
+ "grad_norm": 4.5745469835172825,
133
  "learning_rate": 6.453951129574644e-06,
134
+ "loss": 0.85,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.6923076923076923,
139
+ "grad_norm": 2.9833461950622047,
140
  "learning_rate": 5.234312799786921e-06,
141
+ "loss": 0.8283,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.7307692307692307,
146
+ "grad_norm": 1.3718638590118337,
147
  "learning_rate": 4.100445599768774e-06,
148
+ "loss": 0.8142,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.7692307692307693,
153
+ "grad_norm": 1.0682148259587978,
154
  "learning_rate": 3.0727564649040066e-06,
155
+ "loss": 0.8049,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.8076923076923077,
160
+ "grad_norm": 0.9650280690913742,
161
  "learning_rate": 2.1697413758237785e-06,
162
+ "loss": 0.8029,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.8461538461538461,
167
+ "grad_norm": 1.0735752809829815,
168
  "learning_rate": 1.407652474377832e-06,
169
+ "loss": 0.793,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.8846153846153846,
174
+ "grad_norm": 0.7668233578378257,
175
  "learning_rate": 8.002055634117578e-07,
176
+ "loss": 0.7941,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.9230769230769231,
181
+ "grad_norm": 0.9841012840352416,
182
  "learning_rate": 3.5833325466437697e-07,
183
+ "loss": 0.7823,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.9615384615384616,
188
+ "grad_norm": 0.7158242209277041,
189
  "learning_rate": 8.99882075409153e-08,
190
+ "loss": 0.776,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 1.0,
195
+ "grad_norm": 0.8276248743240251,
196
  "learning_rate": 0.0,
197
+ "loss": 0.7805,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 1.0,
202
+ "eval_loss": 2.284010648727417,
203
+ "eval_runtime": 0.4599,
204
+ "eval_samples_per_second": 26.092,
205
+ "eval_steps_per_second": 2.174,
206
  "step": 130
207
  },
208
  {
209
  "epoch": 1.0,
210
  "step": 130,
211
  "total_flos": 17867533713408.0,
212
+ "train_loss": 2.5464949974646935,
213
+ "train_runtime": 907.3672,
214
+ "train_samples_per_second": 36.642,
215
  "train_steps_per_second": 0.143
216
  }
217
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6422201a5ac12b4bbe915d5718a159f31164a382494e678d2426b5c5035ef49e
3
  size 7288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e174f24b0a38ca479c097f126c671739f81f1ab012cec2208f1f9e98f03a8aec
3
  size 7288