After training, the model is saved to the specified directory (`new_model`).

Here’s an example configuration used for fine-tuning:

_Hint:_ the base model is `NousResearch/Llama-2-7b-chat-hf` and the instruction dataset is `mlabonne/guanaco-llama2-1k`.

_Hint:_ I saved both to my local machine and load them from those paths; you can instead download them directly from the Hugging Face Hub.

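If you would rather not keep local copies, a minimal sketch of pointing the same variables straight at the Hub (the IDs come from the hints above; the commented `snapshot_download` call and its `local_dir` are illustrative and assume the `huggingface_hub` package is installed):

```python
# Hub IDs work directly; transformers/datasets download and cache them on first use
model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "mlabonne/guanaco-llama2-1k"

# Or pre-download an explicit local copy first (illustrative target directory):
# from huggingface_hub import snapshot_download
# snapshot_download(repo_id="NousResearch/Llama-2-7b-chat-hf", local_dir="Llama-2-7b-chat-hf")
```

The example configuration below keeps the local paths.
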
```python
model_name = "/data/bio-eng-llm/llm_repo/NousResearch/Llama-2-7b-chat-hf"  # the base model is: NousResearch/Llama-2-7b-chat-hf
dataset_name = "/data/bio-eng-llm/llm_repo/mlabonne/guanaco-llama2-1k"  # the dataset is: mlabonne/guanaco-llama2-1k
new_model = "/data/bio-eng-llm/llm_repo/mlabonne/llama-2-7b-miniguanaco"

lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

output_dir = "./results"
num_train_epochs = 300
fp16 = False
bf16 = False
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25
```
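Two quantities implied by these settings, as a quick sanity check (a back-of-the-envelope sketch assuming a single GPU; nothing below is needed by the training script itself):

```python
# Effective batch size per optimizer step:
# per-device batch size * gradient accumulation steps * GPU count (assumed 1 here)
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * 1  # = 4

# PEFT scales the LoRA update by lora_alpha / r
lora_scaling = lora_alpha / lora_r  # = 16 / 64 = 0.25
```

With `max_steps = -1`, the total number of optimizer steps is determined by `num_train_epochs` and this effective batch size.
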

## The entire Python training module

```python
import os
import sys

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Make local helper modules importable from the current working directory
cwd = os.getcwd()
sys.path.append(cwd)


def setting_directory(depth):
    # Walk `depth` levels up from the current directory, add each parent to sys.path,
    # and return the resulting root directory
    current_dir = os.path.abspath(os.getcwd())
    root_dir = current_dir
    for i in range(depth):
        root_dir = os.path.abspath(os.path.join(root_dir, os.pardir))
        sys.path.append(os.path.dirname(root_dir))
    return root_dir


# The model that you want to train from the Hugging Face hub
model_name = "/data/bio-eng-llm/llm_repo/NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "/data/bio-eng-llm/llm_repo/mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "/data/bio-eng-llm/llm_repo/mlabonne/llama-2-7b-miniguanaco"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 300

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with the same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}

################################################################################

# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

print(dataset[0].keys())  # Print all the field names in the dataset

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)
```
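
After `trainer.train()` completes, only the LoRA adapter weights are saved to `new_model`, not a full standalone checkpoint. Below is a minimal inference sketch, assuming the variables from the script above are still in scope (the prompt string and `max_length` are illustrative), that attaches the adapter to the base model with the already imported `PeftModel` and `pipeline`:

```python
# Reload the 4-bit base model and attach the fine-tuned LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
ft_model = PeftModel.from_pretrained(base_model, new_model)

# The guanaco-llama2-1k samples use the Llama-2 chat format, so prompt accordingly
prompt = "What is a large language model?"  # example prompt
pipe = pipeline(task="text-generation", model=ft_model, tokenizer=tokenizer, max_length=200)
print(pipe(f"<s>[INST] {prompt} [/INST]")[0]["generated_text"])
```

To ship a single merged checkpoint instead, the adapter can be folded into a non-quantized reload of the base model via `PeftModel.from_pretrained(...).merge_and_unload()`. Training curves are written under `output_dir` because of `report_to="tensorboard"` and can be viewed with `tensorboard --logdir results`.
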
## License
This repository is licensed under the [MIT License](LICENSE).