import os

from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Authenticate with the Hugging Face Hub using a token read from the environment.
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
login(token=hf_token)

MODEL_NAME = "vv876803/tinyllama-victor"

# `token` replaces the deprecated `use_auth_token` argument in recent transformers releases.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)

# Llama-style tokenizers ship without a pad token; the data collator needs one to pad batches.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter configuration; with no explicit target_modules, PEFT falls back to its
# default mapping for Llama-style models (the attention projection layers).
peft_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)

# prepare_model_for_kbit_training is usually paired with a quantized (4/8-bit) base model;
# here it still freezes the base weights and enables gradient checkpointing.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
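
# Quick visibility into how few parameters the LoRA adapter actually trains.
model.print_trainable_parameters()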

# A 10% slice of OASST1 keeps the run small; each row's "text" field holds one message.
dataset = load_dataset("OpenAssistant/oasst1", split="train[:10%]")


def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)


# Drop the raw columns after tokenization so only input_ids/attention_mask reach the trainer.
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)

# Hold out a small slice so the per-epoch evaluation configured below has data to run on.
split_dataset = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Standard causal-LM collator: mlm=False makes the labels a copy of the inputs,
# with padding positions masked out of the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned",
    # Deprecated in favor of `eval_strategy` in newer transformers releases; rename if needed.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=False,
)

# An eval_dataset is required because evaluation_strategy="epoch" triggers evaluation
# at the end of every epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

# save_pretrained on a PEFT model writes only the LoRA adapter weights and config,
# not the full base model.
model.save_pretrained("./tinyllama-finetuned")
tokenizer.save_pretrained("./tinyllama-finetuned")
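
# Optional sanity check, a minimal sketch: reload the saved adapter on top of the base
# model and generate a short completion. Assumes the adapter directory written above,
# enough memory for a second copy of the base model, and an arbitrary example prompt.
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)
finetuned = PeftModel.from_pretrained(base_model, "./tinyllama-finetuned")
finetuned.eval()

prompt = "What is parameter-efficient fine-tuning?"
inputs = tokenizer(prompt, return_tensors="pt").to(finetuned.device)
outputs = finetuned.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))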