import torch
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

# ✅ 1. Load the training data
with open("train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# ✅ 2. Convert the data to the Hugging Face dataset format
# ("jautājums" = question, "atbilde" = answer)
dataset = Dataset.from_list(
    [{"input_text": d["jautājums"], "target_text": d["atbilde"]} for d in train_data]
)

# ✅ 3. Split the data into train and test sets (80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# ✅ 4. Choose the model (`mT5-small` or another)
model_name = "google/mt5-small"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# ✅ 5. Prepare the data for training (tokenize inputs and targets)
def preprocess_data(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=128, truncation=True)
    labels = tokenizer(examples["target_text"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_data, batched=True)
eval_dataset = eval_dataset.map(preprocess_data, batched=True)

# ✅ 6. Define the training arguments
training_args = TrainingArguments(
    output_dir="./trained_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,  # ✅ Set to True if you want to upload to the Hugging Face Hub
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ✅ 7. Create the `Trainer`
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ✅ 8. Start training
trainer.train()

# ✅ 9. Save the trained model and tokenizer
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

# ✅ 10. Evaluate the model after training
results = trainer.evaluate()
print("📊 Evaluation results:", results)

# ✅ 11. To upload the model to the Hugging Face Hub
# (log in to Hugging Face with `notebook_login()` first)
# trainer.push_to_hub("lietotajsvards/elektro-ai")
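
# ✅ 12. Quick sanity check (optional): generate an answer with the freshly
# trained model. This is a minimal sketch, not part of the original script —
# the sample question and the generation settings (beam search,
# max_new_tokens) are illustrative assumptions.
sample_question = "Kā nomainīt drošinātāju?"  # hypothetical example question
inputs = tokenizer(sample_question, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128, num_beams=4)
print("🤖 Answer:", tokenizer.decode(outputs[0], skip_special_tokens=True))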