mtasic85 committed
Commit 58d313d
1 Parent(s): 39e0190

train model

scripts/train_model.py CHANGED
@@ -1,7 +1,16 @@
 import gc
+import sys
 
 from datasets import load_dataset, Dataset
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from transformers import AutoConfig
+from transformers import DataCollatorForLanguageModeling
+
+
+x = input('Are you sure? [y/N]')
+
+if x not in ('y', 'Y', 'yes'):
+    sys.exit(0)
 
 
 def _batch_iterator():
@@ -36,6 +45,7 @@ def _batch_iterator():
 
     del dataset
     gc.collect()
+    return
 
     # text
     dataset = load_dataset('nampdn-ai/tiny-textbooks', split='train')
@@ -180,5 +190,60 @@ tokenizer = AutoTokenizer.from_pretrained('../')
 
 dataset = Dataset.from_generator(batch_iterator)
 print(dataset)
-print(dir(dataset))
-input()
+
+
+def tokenize_function(examples):
+    outputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=32 * 1024)
+    outputs['labels'] = outputs['input_ids'].copy()
+    return outputs
+
+
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.01)
+
+config = AutoConfig.from_pretrained('mistralai/Mistral-7B-Instruct-v0.3')
+config.bos_token_id = tokenizer.bos_token_id
+config.eos_token_id = tokenizer.eos_token_id
+config.unk_token_id = tokenizer.unk_token_id
+config.pad_token_id = tokenizer.pad_token_id
+config.hidden_size = 512
+config.intermediate_size = int(512 * 3.5) # 1792
+config.max_position_embeddings = 32 * 1024 # 32768
+config.num_attention_heads = 12
+config.num_hidden_layers = 10
+config.num_key_value_heads = 4
+config.rope_theta = 1_000_000.0
+config.sliding_window = 4096
+print(config)
+
+model = AutoModelForCausalLM.from_config(config)
+print(model)
+
+training_args = TrainingArguments(
+    output_dir='./results',
+    num_train_epochs=3,
+    per_device_train_batch_size=2, # Adjust based on your GPU memory
+    per_device_eval_batch_size=2,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir='./logs',
+    logging_steps=10,
+    fp16=False,
+    bf16=True,
+)
+print(training_args)
+
+data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+print(data_collator)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_datasets['train'],
+    eval_dataset=tokenized_datasets['test'],
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+)
+
+print(trainer)
+trainer.train()
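
Note: the training code added above pairs labels copied from input_ids in tokenize_function with DataCollatorForLanguageModeling(mlm=False), which is the standard causal-LM setup. The following minimal sketch (not part of this commit) shows that mechanism in isolation; it uses gpt2 as a stand-in tokenizer purely for illustration, whereas the script loads the repo's own tokenizer from '../'.

# Illustrative sketch only. Assumes torch and transformers are installed;
# 'gpt2' is a placeholder tokenizer, not the one trained in this repo.
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

features = [tokenizer(t) for t in ('hello world', 'a longer example sentence')]
batch = collator(features)

# labels mirror input_ids, with padded positions set to -100 so the loss
# ignores them; the model shifts inputs/labels internally for next-token prediction
print(batch['input_ids'].shape, batch['labels'].shape)
print(batch['labels'])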
scripts/train_tokenizer.py CHANGED
@@ -1,4 +1,5 @@
 import gc
+import sys
 import string
 
 from datasets import load_dataset
@@ -9,6 +10,12 @@ from tokenizers.trainers import BpeTrainer
 from tokenizers.processors import TemplateProcessing
 
 
+x = input('Are you sure?')
+
+if x not in ('y', 'Y', 'yes'):
+    sys.exit(0)
+
+
 def batch_iterator():
     # code
     dataset = load_dataset('bigcode/programming-languages-keywords', split='train')