print("Starting training process...")
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    DataCollatorForSeq2Seq
)
from training_config import training_args
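# Note: training_config is a local module not shown in this file. It is assumed
# to expose a transformers.TrainingArguments instance, roughly like:
#
#   from transformers import TrainingArguments
#   training_args = TrainingArguments(
#       output_dir="./results",              # hypothetical values for illustration
#       per_device_train_batch_size=8,
#       num_train_epochs=3,
#       evaluation_strategy="epoch",
#   )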
# Load dataset
dataset = load_dataset("health360/Healix-Shot", split="train[:100000]")
# Initialize model and tokenizer
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
def tokenize_function(examples):
    # Tokenize the raw text, padding/truncating every example to 512 tokens
    tokenized = tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_attention_mask=True
    )
    # The Trainer needs labels to compute a loss. The corpus provides no targets,
    # so reuse the input ids as labels (a self-reconstruction objective).
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized
# Process dataset
train_test_split = dataset.train_test_split(test_size=0.1)
tokenized_train = train_test_split['train'].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)
tokenized_eval = train_test_split['test'].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
)
# Train and save
print("Starting the training...")
trainer.train()
print("Training complete, saving model...")
model.push_to_hub("MjolnirThor/flan-t5-custom-handler")
tokenizer.push_to_hub("MjolnirThor/flan-t5-custom-handler")
print("Model saved successfully!") |