Yhhxhfh committed
Commit
f665e00
1 Parent(s): 8ddb144

Update app.py

Files changed (1): app.py (+12 -4)
app.py CHANGED
@@ -2,7 +2,7 @@ import os
 import platform
 from dotenv import load_dotenv
 import torch
-from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
+from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
 from datasets import load_dataset, concatenate_datasets
 from huggingface_hub import login
 import time
@@ -44,7 +44,7 @@ async def root():
 def load_and_train():
     model_name = 'gpt2'
     tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-    model = GPT2LMHeadModel.from_pretrained(model_name)
+    model = GPT2LMHeadModel.from_pretrained(model_name, return_dict=True)
 
     # Assign the pad_token to the eos_token
     tokenizer.pad_token = tokenizer.eos_token
@@ -128,13 +128,14 @@ def load_and_train():
 
     # Tokenization function based on the 'text' field
     def tokenize_function(examples):
-        return tokenizer(
+        tokenized = tokenizer(
             examples['text'],
             truncation=True,
             padding='max_length',
             max_length=512
-            # clean_up_tokenization_spaces=True  # Removed because it is not recognized
         )
+        tokenized['labels'] = tokenized['input_ids'].copy()
+        return tokenized
 
     # Tokenize the dataset
     tokenized_dataset = combined_dataset.map(
@@ -142,6 +143,12 @@ def load_and_train():
         batched=True
     )
 
+    # Configure the data collator
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False  # for causal language modeling
+    )
+
     # Configure training arguments
     training_args = TrainingArguments(
         output_dir=os.path.join(cache_dir, 'results'),  # stored temporarily in RAM
@@ -164,6 +171,7 @@ def load_and_train():
         model=model,
         args=training_args,
         train_dataset=tokenized_dataset,
+        data_collator=data_collator,
     )
 
     while True:
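The key change in tokenize_function is the new labels field: GPT2LMHeadModel only computes a loss when labels are passed, and it shifts them internally for next-token prediction, so copying input_ids is the usual way to set up causal-LM fine-tuning. A minimal standalone sketch of that behavior (the sample text and max_length=16 are arbitrary, not from app.py):

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # same workaround as in app.py
model = GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True)

enc = tokenizer('hello world', return_tensors='pt',
                padding='max_length', truncation=True, max_length=16)

# Without labels the forward pass returns logits but no loss.
out = model(**enc)
print(out.loss)  # None

# With labels copied from input_ids (what tokenize_function now does),
# the model shifts them internally and returns a cross-entropy loss.
out = model(**enc, labels=enc['input_ids'].clone())
print(out.loss)  # a scalar tensor
```

Note that a plain copy leaves the pad positions in the loss; the data collator added by this commit takes care of that, as sketched below.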
 
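The other half of the commit wires a DataCollatorForLanguageModeling with mlm=False into the Trainer. In this mode the collator stacks the features into tensors and, as of recent transformers releases, rebuilds labels from input_ids itself, replacing pad positions with -100 so they are ignored by the loss; that would make the manual tokenized['labels'] copy above redundant, though harmless. A small sketch of what a collated batch looks like (the toy text and max_length=8 stand in for the real 512-token setup):

```python
from transformers import DataCollatorForLanguageModeling, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # as in app.py

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# One toy feature, padded the same way tokenize_function pads to 512.
features = [tokenizer('short example', truncation=True,
                      padding='max_length', max_length=8)]
batch = collator(features)

print(batch['input_ids'][0])  # real token ids followed by pad ids (50256)
print(batch['labels'][0])     # the same ids, with pad positions set to -100
```

One caveat: because pad_token is aliased to eos_token, genuine end-of-text tokens are masked to -100 along with the padding, which is a common trade-off when fine-tuning GPT-2.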