Update app.py
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
 import platform
 from dotenv import load_dotenv
 import torch
-from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
+from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
 from datasets import load_dataset, concatenate_datasets
 from huggingface_hub import login
 import time
@@ -44,7 +44,7 @@ async def root():
 def load_and_train():
     model_name = 'gpt2'
     tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-    model = GPT2LMHeadModel.from_pretrained(model_name)
+    model = GPT2LMHeadModel.from_pretrained(model_name, return_dict=True)
 
     # Assign the pad_token to the eos_token
     tokenizer.pad_token = tokenizer.eos_token
@@ -128,13 +128,14 @@ def load_and_train():
 
     # Tokenization function based on the 'text' field
     def tokenize_function(examples):
-
+        tokenized = tokenizer(
             examples['text'],
             truncation=True,
             padding='max_length',
             max_length=512
-            # clean_up_tokenization_spaces=True  # Removed because it is not recognized
         )
+        tokenized['labels'] = tokenized['input_ids'].copy()
+        return tokenized
 
     # Tokenize the dataset
     tokenized_dataset = combined_dataset.map(
@@ -142,6 +143,12 @@ def load_and_train():
         batched=True
     )
 
+    # Configure the data collator
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False  # For causal language modeling
+    )
+
     # Configure training arguments
     training_args = TrainingArguments(
         output_dir=os.path.join(cache_dir, 'results'),  # Store temporarily in RAM
@@ -164,6 +171,7 @@ def load_and_train():
         model=model,
         args=training_args,
         train_dataset=tokenized_dataset,
+        data_collator=data_collator,
     )
 
     while True:
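For reference, the pieces this commit puts in place (labels carried through tokenization, a DataCollatorForLanguageModeling with mlm=False for causal language modeling, and a Trainer wired with that collator) fit together as in the minimal standalone sketch below. This is not the Space's actual app.py: the dataset (wikitext), output_dir, batch size, and epoch count are placeholders, whereas the real script concatenates its own datasets, caches to RAM, and retrains inside a while True loop.

# Minimal sketch of the training setup after this commit (placeholder dataset and paths).
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, return_dict=True)

# GPT-2 ships without a pad token, so reuse the eos token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Placeholder corpus; the real app builds its dataset with concatenate_datasets.
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train[:1%]')

def tokenize_function(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    # For causal LM the targets are the inputs themselves.
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# mlm=False selects causal (GPT-style) language modeling; the collator pads
# batches and derives labels from input_ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir='./results',          # placeholder; the app writes under cache_dir
    per_device_train_batch_size=2,   # placeholder values, not taken from app.py
    num_train_epochs=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

One design note: with mlm=False the collator already rebuilds labels from input_ids at batch time, so the explicit labels copy in tokenize_function overlaps with it; either mechanism on its own is enough to get a causal-LM loss out of the Trainer.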