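"""Hugging Face Space that continually fine-tunes GPT-2 on a blend of
dialogue (daily_dialog), code (code_search_net), and coding prompts
(openai_humaneval), pushing the model and tokenizer to the Hub after
each training pass."""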
import os
import shutil
import time

import spaces
import torch
from dotenv import load_dotenv
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
# NOTE: the original imported `from autotrain import AutoTrain`, but the
# autotrain package does not expose a trainer class with that interface;
# transformers' Trainer is used below instead.
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

load_dotenv()
login(token=os.getenv('HUGGINGFACE_TOKEN'))
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 ships without a pad token; reuse EOS so padding='max_length' works.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
dataset_humanizado = load_dataset('daily_dialog', split='train')
# code_search_net requires a language config; 'python' is assumed here.
dataset_codigo = load_dataset('code_search_net', 'python', split='train')
# openai_humaneval only publishes a 'test' split; 'train' does not exist.
dataset_prompts = load_dataset('openai_humaneval', split='test')
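# The three datasets have different schemas, and concatenate_datasets
# requires matching features, so each one is reduced to a single 'text'
# column first. The source fields used below ('dialog',
# 'whole_func_string', 'prompt' + 'canonical_solution') are assumptions
# about which columns carry the usable text.
dataset_humanizado = dataset_humanizado.map(
    lambda x: {'text': ' '.join(x['dialog'])},  # each example is a list of utterances
    remove_columns=dataset_humanizado.column_names,
)
dataset_codigo = dataset_codigo.map(
    lambda x: {'text': x['whole_func_string']},
    remove_columns=dataset_codigo.column_names,
)
dataset_prompts = dataset_prompts.map(
    lambda x: {'text': x['prompt'] + x['canonical_solution']},
    remove_columns=dataset_prompts.column_names,
)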
combined_dataset = concatenate_datasets([
dataset_humanizado,
dataset_codigo,
dataset_prompts
])
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

# Drop the raw 'text' column so only token tensors reach the trainer.
tokenized_dataset = combined_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,   # 100 is far beyond what GPT-2 @ 512 tokens fits on most GPUs
    per_device_eval_batch_size=8,
    num_train_epochs=1,              # was 0, which would train nothing
    learning_rate=1e-5,
    logging_steps=100,               # was -1, which is invalid
    max_grad_norm=1.0,
    save_total_limit=1,
    seed=42,
    weight_decay=0.0,
    warmup_ratio=0.0,
    evaluation_strategy='no',
    optim='adamw_torch',
    lr_scheduler_type='constant',
    # 'model_max_length' is not a TrainingArguments field, and the original
    # value (2098989848) far exceeds GPT-2's 1024-token context; dropped.
)
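# For causal language modeling, DataCollatorForLanguageModeling with
# mlm=False copies input_ids into labels so GPT-2 can compute its loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)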
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
# The decorator is spaces.GPU (not spaces.gpu). Note that ZeroGPU
# allocations are time-limited, so the loop below may be cut short.
@spaces.GPU
def run_training():
    while True:
        try:
            trainer.train()
            # push_to_hub for models/tokenizers targets a model repo by
            # default; the original repo_type='model' kwarg is not part of
            # its signature and has been dropped.
            model.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', use_temp_dir=True, commit_message="Model update")
            tokenizer.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', use_temp_dir=True, commit_message="Tokenizer update")
            time.sleep(5)
        except Exception as e:
            print(f"Error during training: {e}. Restarting the training loop...")
            time.sleep(10)
# Clear stale checkpoints before starting. The original ran this after
# run_training(), where the infinite loop made it unreachable.
shutil.rmtree('./results', ignore_errors=True)
run_training()