train model
- scripts/train_model.py +68 -3
- scripts/train_tokenizer.py +7 -0
scripts/train_model.py
CHANGED
@@ -1,7 +1,16 @@
 import gc
+import sys
 
 from datasets import load_dataset, Dataset
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from transformers import AutoConfig
+from transformers import DataCollatorForLanguageModeling
+
+
+x = input('Are you sure? [y/N]')
+
+if x not in ('y', 'Y', 'yes'):
+    sys.exit(0)
 
 
 def _batch_iterator():
@@ -36,6 +45,7 @@ def _batch_iterator():
 
     del dataset
     gc.collect()
+    return
 
     # text
     dataset = load_dataset('nampdn-ai/tiny-textbooks', split='train')
@@ -180,5 +190,60 @@ tokenizer = AutoTokenizer.from_pretrained('../')
 
 dataset = Dataset.from_generator(batch_iterator)
 print(dataset)
-
-
+
+
+def tokenize_function(examples):
+    outputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=32 * 1024)
+    outputs['labels'] = outputs['input_ids'].copy()
+    return outputs
+
+
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.01)
+
+config = AutoConfig.from_pretrained('mistralai/Mistral-7B-Instruct-v0.3')
+config.bos_token_id = tokenizer.bos_token_id
+config.eos_token_id = tokenizer.eos_token_id
+config.unk_token_id = tokenizer.unk_token_id
+config.pad_token_id = tokenizer.pad_token_id
+config.hidden_size = 512
+config.intermediate_size = int(512 * 3.5)  # 1792
+config.max_position_embeddings = 32 * 1024  # 32768
+config.num_attention_heads = 12
+config.num_hidden_layers = 10
+config.num_key_value_heads = 4
+config.rope_theta = 1_000_000.0
+config.sliding_window = 4096
+print(config)
+
+model = AutoModelForCausalLM.from_config(config)
+print(model)
+
+training_args = TrainingArguments(
+    output_dir='./results',
+    num_train_epochs=3,
+    per_device_train_batch_size=2,  # Adjust based on your GPU memory
+    per_device_eval_batch_size=2,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir='./logs',
+    logging_steps=10,
+    fp16=False,
+    bf16=True,
+)
+print(training_args)
+
+data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+print(data_collator)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_datasets['train'],
+    eval_dataset=tokenized_datasets['test'],
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+)
+
+print(trainer)
+trainer.train()
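Note on the added training code: DataCollatorForLanguageModeling(mlm=False) already pads each batch dynamically and builds labels by copying input_ids (masking pad positions to -100), so the explicit labels copy and the fixed padding='max_length' to 32,768 tokens in tokenize_function duplicate work the collator performs anyway. A minimal sketch of the leaner variant, assuming the same tokenizer object the script loads:

# Sketch (not part of this commit): leave padding and label creation to
# DataCollatorForLanguageModeling(mlm=False), which pads per batch and
# derives labels from input_ids itself.
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=32 * 1024)

Separately, the config overrides leave the attention head size implicit: transformers versions that derive it as hidden_size // num_attention_heads will reject hidden_size=512 with num_attention_heads=12 at model construction, since 512 is not divisible by 12. A head count that divides 512 evenly (for example 8 or 16), or an explicit config.head_dim on releases that support it, avoids the mismatch.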
scripts/train_tokenizer.py
CHANGED
@@ -1,4 +1,5 @@
 import gc
+import sys
 import string
 
 from datasets import load_dataset
@@ -9,6 +10,12 @@ from tokenizers.trainers import BpeTrainer
 from tokenizers.processors import TemplateProcessing
 
 
+x = input('Are you sure?')
+
+if x not in ('y', 'Y', 'yes'):
+    sys.exit(0)
+
+
 def batch_iterator():
     # code
     dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
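Both scripts now duplicate the same confirmation guard. A shared helper (hypothetical, not part of this commit; the module name and prompt default are illustrative) could factor it out:

# scripts/_confirm.py (hypothetical helper, not in this commit)
import sys


def confirm(prompt='Are you sure? [y/N] '):
    # Exit quietly unless the user explicitly answers yes.
    if input(prompt).strip().lower() not in ('y', 'yes'):
        sys.exit(0)

Each script would then call confirm() once before doing any work.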