After training, the model is saved to the specified directory (`new_model`).

Here’s an example configuration used for fine-tuning:

_Hint:_ the base model is `NousResearch/Llama-2-7b-chat-hf` and the instruction dataset is `mlabonne/guanaco-llama2-1k`.

_Hint:_ I saved both to my local machine and load them from those paths; you can instead download them directly from the Hugging Face Hub.

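If you would rather not keep local copies, a minimal sketch of pointing the same variables straight at the Hub (the IDs come from the hints above; the commented `snapshot_download` call and its `local_dir` are illustrative and assume the `huggingface_hub` package is installed):

```python
# Hub IDs work directly; transformers/datasets download and cache them on first use
model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "mlabonne/guanaco-llama2-1k"

# Or pre-download an explicit local copy first (illustrative target directory):
# from huggingface_hub import snapshot_download
# snapshot_download(repo_id="NousResearch/Llama-2-7b-chat-hf", local_dir="Llama-2-7b-chat-hf")
```

The example configuration below keeps the local paths.
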
```python
model_name = "/data/bio-eng-llm/llm_repo/NousResearch/Llama-2-7b-chat-hf"  # the base model is: NousResearch/Llama-2-7b-chat-hf
dataset_name = "/data/bio-eng-llm/llm_repo/mlabonne/guanaco-llama2-1k"  # the dataset is: mlabonne/guanaco-llama2-1k
new_model = "/data/bio-eng-llm/llm_repo/mlabonne/llama-2-7b-miniguanaco"

lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

output_dir = "./results"
num_train_epochs = 300
fp16 = False
bf16 = False
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25
```
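Two quantities implied by these settings, as a quick sanity check (a back-of-the-envelope sketch assuming a single GPU; nothing below is needed by the training script itself):

```python
# Effective batch size per optimizer step:
# per-device batch size * gradient accumulation steps * GPU count (assumed 1 here)
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * 1  # = 4

# PEFT scales the LoRA update by lora_alpha / r
lora_scaling = lora_alpha / lora_r  # = 16 / 64 = 0.25
```

With `max_steps = -1`, the total number of optimizer steps is determined by `num_train_epochs` and this effective batch size.
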

## The entire Python training module

```python
import os
import sys

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Make local helper modules importable from the current working directory
cwd = os.getcwd()
sys.path.append(cwd)


def setting_directory(depth):
    # Walk `depth` levels up from the current directory, add each parent to sys.path,
    # and return the resulting root directory
    current_dir = os.path.abspath(os.getcwd())
    root_dir = current_dir
    for i in range(depth):
        root_dir = os.path.abspath(os.path.join(root_dir, os.pardir))
        sys.path.append(os.path.dirname(root_dir))
    return root_dir


# The model that you want to train from the Hugging Face hub
model_name = "/data/bio-eng-llm/llm_repo/NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "/data/bio-eng-llm/llm_repo/mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "/data/bio-eng-llm/llm_repo/mlabonne/llama-2-7b-miniguanaco"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 300

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with the same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}

################################################################################

# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

print(dataset[0].keys())  # Print all the field names in the dataset

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)
```
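
After `trainer.train()` completes, only the LoRA adapter weights are saved to `new_model`, not a full standalone checkpoint. Below is a minimal inference sketch, assuming the variables from the script above are still in scope (the prompt string and `max_length` are illustrative), that attaches the adapter to the base model with the already imported `PeftModel` and `pipeline`:

```python
# Reload the 4-bit base model and attach the fine-tuned LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
ft_model = PeftModel.from_pretrained(base_model, new_model)

# The guanaco-llama2-1k samples use the Llama-2 chat format, so prompt accordingly
prompt = "What is a large language model?"  # example prompt
pipe = pipeline(task="text-generation", model=ft_model, tokenizer=tokenizer, max_length=200)
print(pipe(f"<s>[INST] {prompt} [/INST]")[0]["generated_text"])
```

To ship a single merged checkpoint instead, the adapter can be folded into a non-quantized reload of the base model via `PeftModel.from_pretrained(...).merge_and_unload()`. Training curves are written under `output_dir` because of `report_to="tensorboard"` and can be viewed with `tensorboard --logdir results`.
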
## License
This repository is licensed under the [MIT License](LICENSE).