import torch
import json
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset

# ✅ 1. Load the training data
# Expected format: a JSON list of objects with "question" and "answer" keys.
with open("train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# ✅ 2. Convert the data into the Hugging Face format
dataset = Dataset.from_list(
    [{"input_text": d["question"], "target_text": d["answer"]} for d in train_data]
)

# ✅ 3. Choose a model (e.g. `google/mt5-small`; note that a decoder-only model
#    like `facebook/opt-1.3b` is NOT compatible with T5ForConditionalGeneration)
model_name = "google/mt5-small"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# ✅ 4. Prepare the data for training
def preprocess_data(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=128, truncation=True)
    # Tokenize the targets via `text_target` so they are encoded as labels
    labels = tokenizer(text_target=examples["target_text"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = dataset.map(preprocess_data, batched=True)

# ✅ 5. Define the training arguments
# Note: evaluation_strategy="epoch" would crash here, because no eval_dataset
# is passed to the Trainer below; add one before enabling per-epoch evaluation.
training_args = TrainingArguments(
    output_dir="./trained_model",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,   # Train for 3 epochs
    weight_decay=0.01,
    push_to_hub=True,     # Upload to the Hugging Face Hub (requires `huggingface-cli login`)
    logging_dir="./logs",
)

# Dynamically pads inputs and labels per batch (label padding is set to -100
# so padded positions are ignored by the loss)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ✅ 6. Create the `Trainer`
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ✅ 7. Start training
trainer.train()

# ✅ 8. Save the trained model
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

# ✅ 9. Upload to the Hugging Face Hub (if needed)
trainer.push_to_hub()
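
# ✅ 10. Optional: quick inference check.
# A minimal sketch, assuming the model was saved to "./trained_model" as in
# step 8; the example question and the generation parameters (max_new_tokens,
# num_beams) are illustrative, not prescribed by the training setup above.
model = T5ForConditionalGeneration.from_pretrained("./trained_model").to(device)
tokenizer = T5Tokenizer.from_pretrained("./trained_model")

question = "..."  # replace with a real question from your training domain
inputs = tokenizer(question, return_tensors="pt", max_length=128, truncation=True).to(device)
outputs = model.generate(**inputs, max_new_tokens=64, num_beams=4)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))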