import os
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments,
)
# Authenticate with Hugging Face using a token from the environment
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
if hf_token is None:
    raise EnvironmentError("Set HUGGING_FACE_HUB_TOKEN before running this script.")
login(token=hf_token)
# Load the pretrained TinyLlama model and tokenizer
MODEL_NAME = "vv876803/tinyllama-victor"  # Replace with your TinyLlama model name if different
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)
# Llama-style tokenizers ship without a pad token; reuse EOS so batches can be padded
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Configure LoRA for parameter-efficient fine-tuning
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = prepare_model_for_kbit_training(model)  # Freezes base weights (mainly intended for quantized models, harmless here)
model = get_peft_model(model, peft_config)
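# Optional sanity check: report how few parameters the LoRA adapters actually train
# compared with the full model.
model.print_trainable_parameters()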
# Load the dataset (example: OpenAssistant OASST1) and hold out a slice for evaluation
dataset = load_dataset("OpenAssistant/oasst1", split="train[:10%]")
dataset = dataset.train_test_split(test_size=0.1, seed=42)
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal language modeling, not masked LM
)
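# Optional: a quick illustrative check of what the collator produces. It pads a batch and
# copies input_ids into labels, masking pad positions with -100 so they are ignored by the
# loss. The two-example batch below is only for inspection, not part of training.
sample_batch = data_collator(
    [{"input_ids": tokenized_datasets["train"][i]["input_ids"]} for i in range(2)]
)
print(sample_batch["input_ids"].shape, sample_batch["labels"].shape)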
# Training arguments (CPU-friendly settings for free-tier hardware)
training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned",  # Directory for checkpoints and the final model
    evaluation_strategy="epoch",         # Evaluate after each epoch
    save_strategy="epoch",               # Save after each epoch
    per_device_train_batch_size=2,       # Small batch size due to limited resources
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs",                # Log directory
    logging_steps=10,
    save_total_limit=2,                  # Keep only the two most recent checkpoints
    fp16=False,                          # Stay in float32 on CPU
)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)
# Train the model
trainer.train()
# Save the fine-tuned LoRA adapter and tokenizer (save_pretrained on a PeftModel stores only the adapter weights)
model.save_pretrained("./tinyllama-finetuned")
tokenizer.save_pretrained("./tinyllama-finetuned")
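# Optional: a minimal smoke test of the fine-tuned model, using the PEFT-wrapped model
# already in memory. The prompt and generation length are illustrative assumptions,
# not part of the original training script.
model.eval()
sample = tokenizer("Hello, how are you?", return_tensors="pt")
generated = model.generate(**sample, max_new_tokens=50)
print(tokenizer.decode(generated[0], skip_special_tokens=True))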