import os

from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Authenticate with Hugging Face (set HUGGING_FACE_HUB_TOKEN in your environment)
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
login(token=hf_token)

# Load Pretrained TinyLlama Model & Tokenizer
MODEL_NAME = "vv876803/tinyllama-victor"  # Replace with your TinyLlama model name if different
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)

# Llama-family tokenizers ship without a pad token; reuse EOS so batch padding works
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare LoRA for Efficient Training
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = prepare_model_for_kbit_training(model)  # Enables gradient checkpointing and fp32 norms (mainly useful for quantized models)
model = get_peft_model(model, peft_config)

# Load Dataset (Example: OpenAssistant OASST1, first 10% of the train split)
dataset = load_dataset("OpenAssistant/oasst1", split="train[:10%]")

# Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Drop the raw text/metadata columns so only token IDs reach the data collator
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)

# Split off a small evaluation set so per-epoch evaluation has data to run on
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Data Collator for causal language modeling (not masked LM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Training Arguments (CPU-friendly settings: small batches, no mixed precision)
training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned",  # Directory to save the fine-tuned model
    evaluation_strategy="epoch",         # Evaluate after each epoch
    save_strategy="epoch",               # Save after each epoch
    per_device_train_batch_size=2,       # Small batch size due to limited resources
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs",                # Log directory
    logging_steps=10,
    save_total_limit=2,                  # Limit the number of saved checkpoints
    fp16=False,                          # Use float32 since training runs on CPU
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned LoRA adapter and tokenizer
model.save_pretrained("./tinyllama-finetuned")
tokenizer.save_pretrained("./tinyllama-finetuned")
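
# --- Optional: reload the saved adapter for inference ---
# A minimal sketch, not part of the original script: it assumes the adapter and
# tokenizer were saved to "./tinyllama-finetuned" as above and that MODEL_NAME
# is still the base checkpoint. PeftModel attaches the saved LoRA weights on top
# of a freshly loaded base model.
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)
finetuned_model = PeftModel.from_pretrained(base_model, "./tinyllama-finetuned")
finetuned_model.eval()

prompt = "Hello, how can I fine-tune a small language model?"  # Example prompt (placeholder)
inputs = tokenizer(prompt, return_tensors="pt")
outputs = finetuned_model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))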