from unsloth import FastLanguageModel
import torch
# Define model parameters
max_seq_length = 4096  # Any length works; Unsloth supports RoPE scaling internally
dtype = None  # None for auto-detection; Float16 for Tesla T4/V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/tinyllama-chat-bnb-4bit",  # "unsloth/tinyllama" for 16-bit loading
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...",  # required for gated models like meta-llama/Llama-2-7b-hf
)
# Apply PEFT (Parameter-Efficient Fine-Tuning)
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # LoRA rank; any number > 0 works, suggested values are 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0,  # Any value is supported, but 0 is optimized
    bias="none",     # Any value is supported, but "none" is optimized
    use_gradient_checkpointing=False,  # Set to True if you run out of memory
    random_state=3407,
    use_rslora=False,  # Rank-stabilized LoRA is supported
    loftq_config=None,  # LoftQ is also supported
)
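# Optional sanity check (a sketch): report how many parameters LoRA actually
# trains. This assumes the wrapped model exposes peft's
# print_trainable_parameters(), which Unsloth's PEFT models typically do;
# expect only a small fraction of the weights to be trainable.
model.print_trainable_parameters()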
# Data preparation
import pandas as pd
from sklearn.model_selection import train_test_split
import datasets
# Load the dataset
train = datasets.load_dataset("grammarly/coedit", split="train").to_pandas()
val = datasets.load_dataset("grammarly/coedit", split="validation").to_pandas()
# Data cleaning and preparation
data = pd.concat([train, val])
data[['instruction', 'input']] = data['src'].str.split(': ', n=1, expand=True)
data = data.rename(columns={"tgt": "output"})
data = data.drop(columns=["_id", "src"])
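# Optional peek at the cleaned frame; after the split/rename/drop above the
# columns should be task, output, instruction, and input.
print(data.head(3))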
# Stratify based on task for balanced splits
stratify_col = data['task']
# Split the data into train and test sets
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=stratify_col,
)
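# Optional check that the stratified split preserved the task mix: the two
# normalized distributions below should be nearly identical.
print(train_df["task"].value_counts(normalize=True).round(3))
print(test_df["task"].value_counts(normalize=True).round(3))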
def formatting_prompts_func(examples, tokenizer):
    """
    Formats the examples into the chat format expected for training.

    Args:
        examples: A batch of examples from the dataset.
        tokenizer: The tokenizer whose chat template is applied.

    Returns:
        A dictionary containing the formatted text for each example.
    """
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        message = [
            {"role": "user", "content": instruction + ": " + input_text},
            {"role": "assistant", "content": output},
        ]
        text = tokenizer.apply_chat_template(
            message, tokenize=False, add_generation_prompt=False)
        texts.append(text)
    return {"text": texts}
# Create datasets from pandas DataFrames
train_ds = datasets.Dataset.from_pandas(train_df)
test_ds = datasets.Dataset.from_pandas(test_df)
# Map the formatting function to the datasets for chat format conversion
train_ds = train_ds.map(formatting_prompts_func, fn_kwargs={"tokenizer": tokenizer}, batched=True)
test_ds = test_ds.map(formatting_prompts_func, fn_kwargs={"tokenizer": tokenizer}, batched=True)
print(train_ds[0]['text'])
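# Optional sanity check (a sketch): confirm the formatted examples fit inside
# max_seq_length. Tokenizing everything is slow, so only a sample is checked;
# the sample size of 1000 is an arbitrary choice.
sample = train_ds.select(range(min(1000, len(train_ds))))
lengths = [len(tokenizer(t)["input_ids"]) for t in sample["text"]]
print(f"Longest sampled example: {max(lengths)} tokens (limit: {max_seq_length})")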
# Fine-tuning with trl
from trl import SFTTrainer
from transformers import TrainingArguments
# Define the trainer and training arguments
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=10,
    packing=False,  # Setting this to True can make training ~5x faster for short sequences
    args=TrainingArguments(
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=2,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        save_steps=100,
        save_total_limit=4,  # Limit the total number of checkpoints kept on disk
        evaluation_strategy="steps",
        eval_steps=100,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        load_best_model_at_end=True,
        save_strategy="steps",
    ),
)
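# For reference: the effective batch size per optimizer step is
# per_device_train_batch_size * gradient_accumulation_steps = 8 * 4 = 32
# (per GPU), so one epoch is roughly len(train_ds) / 32 optimizer steps.
print(f"~{len(train_ds) // (8 * 4)} optimizer steps per epoch")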
# Print GPU information
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
# Train the model
trainer_stats = trainer.train()
# Print memory usage statistics
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
# Save the trained model and tokenizer
print("Saving model to local")
model.save_pretrained("coedit-tinyllama-chat-bnb-4bit") # Local saving
tokenizer.save_pretrained("coedit-tinyllama-chat-bnb-4bit")
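# Quick generation sketch to try the fine-tuned adapter. This assumes Unsloth's
# FastLanguageModel.for_inference() helper (which enables its faster inference
# path); the prompt below is an illustrative example, not from the dataset.
FastLanguageModel.for_inference(model)
messages = [{"role": "user", "content": "Fix grammatical errors: She go to school every days."}]
input_ids = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output_ids = model.generate(input_ids=input_ids, max_new_tokens=64, use_cache=True)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))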
# Evaluate the model (Optional)
# trainer.evaluate()