import json

import torch
from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
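
# Load the question/answer training pairs from a local JSON file.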
with open("train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
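
# Wrap the pairs in a Hugging Face Dataset with explicit input/target columns.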
dataset = Dataset.from_list(
    [{"input_text": d["question"], "target_text": d["answer"]} for d in train_data]
)

model_name = "google/mt5-small"
device = "cuda" if torch.cuda.is_available() else "cpu"
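
# Load the pretrained mT5 tokenizer and model; move the model to the GPU when available.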
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
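

# Tokenize questions and answers; padding is deferred to the data collator at batch time.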
def preprocess_data(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=128, truncation=True)
    labels = tokenizer(examples["target_text"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = dataset.map(preprocess_data, batched=True)

# evaluation_strategy="epoch" requires an eval_dataset, so hold out 10% of the
# examples for evaluation.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./trained_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    logging_dir="./logs",
)
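
# The collator dynamically pads inputs and labels in each batch; labels are padded
# with -100 so padded positions are ignored by the loss.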
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
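
# Save the fine-tuned model and tokenizer locally.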
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")
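
# Push the final checkpoint to the Hugging Face Hub (assumes you are logged in,
# e.g. via `huggingface-cli login`, since push_to_hub=True is set above).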
trainer.push_to_hub()