Ali-Forootani committed
Commit 1e5f944
Parent(s): 22684ef

Update README.md

Files changed (1): README.md (+263, -0)

README.md CHANGED
@@ -90,39 +90,302 @@ After training, the model is saved to the specified directory (`new_model`). Thi
 
 Here’s an example configuration used for fine-tuning:
 
+ _hint_: the base model is NousResearch/Llama-2-7b-chat-hf
+ _hint_: the dataset is mlabonne/guanaco-llama2-1k
+
+ _hint_: I saved both to my local machine and load them from there; you can also download them directly from the Hugging Face Hub.
+
+ ```python
+ model_name = "/data/bio-eng-llm/llm_repo/NousResearch/Llama-2-7b-chat-hf"  # base model: NousResearch/Llama-2-7b-chat-hf
+ dataset_name = "/data/bio-eng-llm/llm_repo/mlabonne/guanaco-llama2-1k"  # dataset: mlabonne/guanaco-llama2-1k
+ new_model = "/data/bio-eng-llm/llm_repo/mlabonne/llama-2-7b-miniguanaco"
+
+ lora_r = 64
+ lora_alpha = 16
+ lora_dropout = 0.1
+
+ use_4bit = True
+ bnb_4bit_compute_dtype = "float16"
+ bnb_4bit_quant_type = "nf4"
+ use_nested_quant = False
+
+ output_dir = "./results"
+ num_train_epochs = 300
+ fp16 = False
+ bf16 = False
+ per_device_train_batch_size = 4
+ gradient_accumulation_steps = 1
+ gradient_checkpointing = True
+ max_grad_norm = 0.3
+ learning_rate = 2e-4
+ weight_decay = 0.001
+ optim = "paged_adamw_32bit"
+ lr_scheduler_type = "cosine"
+ max_steps = -1
+ warmup_ratio = 0.03
+ group_by_length = True
+ save_steps = 0
+ logging_steps = 25
+ ```
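
If you prefer to pull the base model and dataset straight from the Hugging Face Hub rather than from local copies, as the hints above suggest, a minimal sketch looks like this (the Hub IDs are the ones named in the hints; the 4-bit quantized loading shown in the full script below still applies):

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_id = "NousResearch/Llama-2-7b-chat-hf"  # Hub ID from the hint above
dataset_id = "mlabonne/guanaco-llama2-1k"          # Hub ID from the hint above

dataset = load_dataset(dataset_id, split="train")         # downloads and caches the dataset
tokenizer = AutoTokenizer.from_pretrained(base_model_id)  # downloads and caches the tokenizer
model = AutoModelForCausalLM.from_pretrained(base_model_id)  # full-precision load; pass quantization_config for QLoRA
```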
+
+ ## The complete Python training script
+
 ```python
+ import os
+ import sys
+
+ import torch
+ from datasets import load_dataset
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     BitsAndBytesConfig,
+     HfArgumentParser,
+     TrainingArguments,
+     pipeline,
+     logging,
+ )
+ from peft import LoraConfig, PeftModel
+ from trl import SFTTrainer
+
+ # Make the current working directory importable
+ cwd = os.getcwd()
+ # sys.path.append(cwd + '/my_directory')
+ sys.path.append(cwd)
+
+
+ def setting_directory(depth):
+     """Walk `depth` levels up from the current directory and return that root."""
+     current_dir = os.path.abspath(os.getcwd())
+     root_dir = current_dir
+     for i in range(depth):
+         root_dir = os.path.abspath(os.path.join(root_dir, os.pardir))
+         sys.path.append(os.path.dirname(root_dir))
+     return root_dir
+
+ #################################
+ # S:\Llavar_repo\LLaVA\NousResearch\Llama-2-7b-chat-hf
+
+ # The model that you want to train from the Hugging Face hub
 model_name = "/data/bio-eng-llm/llm_repo/NousResearch/Llama-2-7b-chat-hf"
+ # model_name = setting_directory(2) + "\\Llavar_repo\\LLaVA\\NousResearch\\Llama-2-7b-chat-hf"
+
+ # The instruction dataset to use
 dataset_name = "/data/bio-eng-llm/llm_repo/mlabonne/guanaco-llama2-1k"
+
+ # Fine-tuned model name
 new_model = "/data/bio-eng-llm/llm_repo/mlabonne/llama-2-7b-miniguanaco"
 
+ ################################################################################
+ # QLoRA parameters
+ ################################################################################
+
+ # LoRA attention dimension
 lora_r = 64
+
+ # Alpha parameter for LoRA scaling
 lora_alpha = 16
+
+ # Dropout probability for LoRA layers
 lora_dropout = 0.1
 
+ ################################################################################
+ # bitsandbytes parameters
+ ################################################################################
+
+ # Activate 4-bit precision base model loading
 use_4bit = True
+
+ # Compute dtype for 4-bit base models
 bnb_4bit_compute_dtype = "float16"
+
+ # Quantization type (fp4 or nf4)
 bnb_4bit_quant_type = "nf4"
+
+ # Activate nested quantization for 4-bit base models (double quantization)
 use_nested_quant = False
 
+ ################################################################################
+ # TrainingArguments parameters
+ ################################################################################
+
+ # Output directory where the model predictions and checkpoints will be stored
 output_dir = "./results"
+
+ # Number of training epochs
 num_train_epochs = 300
+
+ # Enable fp16/bf16 training (set bf16 to True with an A100)
 fp16 = False
 bf16 = False
+
+ # Batch size per GPU for training
 per_device_train_batch_size = 4
+
+ # Batch size per GPU for evaluation
+ per_device_eval_batch_size = 4
+
+ # Number of update steps to accumulate the gradients for
 gradient_accumulation_steps = 1
+
+ # Enable gradient checkpointing
 gradient_checkpointing = True
+
+ # Maximum gradient norm (gradient clipping)
 max_grad_norm = 0.3
+
+ # Initial learning rate (AdamW optimizer)
 learning_rate = 2e-4
+
+ # Weight decay to apply to all layers except bias/LayerNorm weights
 weight_decay = 0.001
+
+ # Optimizer to use
 optim = "paged_adamw_32bit"
+
+ # Learning rate schedule
 lr_scheduler_type = "cosine"
+
+ # Number of training steps (overrides num_train_epochs)
 max_steps = -1
+
+ # Ratio of steps for a linear warmup (from 0 to the learning rate)
 warmup_ratio = 0.03
+
+ # Group sequences into batches of the same length
+ # (saves memory and speeds up training considerably)
 group_by_length = True
+
+ # Save a checkpoint every X update steps
 save_steps = 0
+
+ # Log every X update steps
 logging_steps = 25
+
+ ################################################################################
+ # SFT parameters
+ ################################################################################
+
+ # Maximum sequence length to use
+ max_seq_length = None
+
+ # Pack multiple short examples into the same input sequence to increase efficiency
+ packing = False
+
+ # Load the entire model on GPU 0
+ device_map = {"": 0}
+
+ ################################################################################
+
+ # Load dataset (you can process it here)
+ dataset = load_dataset(dataset_name, split="train")
+
+ print(dataset[0].keys())  # Print all the field names in the dataset
+
+ # Load tokenizer and model with QLoRA configuration
+ compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
+
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=use_4bit,
+     bnb_4bit_quant_type=bnb_4bit_quant_type,
+     bnb_4bit_compute_dtype=compute_dtype,
+     bnb_4bit_use_double_quant=use_nested_quant,
+ )
+
+ # Check GPU compatibility with bfloat16
+ if compute_dtype == torch.float16 and use_4bit:
+     major, _ = torch.cuda.get_device_capability()
+     if major >= 8:
+         print("=" * 80)
+         print("Your GPU supports bfloat16: accelerate training with bf16=True")
+         print("=" * 80)
+
+ # Load base model
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     quantization_config=bnb_config,
+     device_map=device_map
+ )
+ model.config.use_cache = False
+ model.config.pretraining_tp = 1
+
+ # Load LLaMA tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
+
+ # Load LoRA configuration
+ peft_config = LoraConfig(
+     lora_alpha=lora_alpha,
+     lora_dropout=lora_dropout,
+     r=lora_r,
+     bias="none",
+     task_type="CAUSAL_LM",
+ )
+
+ # Set training parameters
+ training_arguments = TrainingArguments(
+     output_dir=output_dir,
+     num_train_epochs=num_train_epochs,
+     per_device_train_batch_size=per_device_train_batch_size,
+     gradient_accumulation_steps=gradient_accumulation_steps,
+     optim=optim,
+     save_steps=save_steps,
+     logging_steps=logging_steps,
+     learning_rate=learning_rate,
+     weight_decay=weight_decay,
+     fp16=fp16,
+     bf16=bf16,
+     max_grad_norm=max_grad_norm,
+     max_steps=max_steps,
+     warmup_ratio=warmup_ratio,
+     group_by_length=group_by_length,
+     lr_scheduler_type=lr_scheduler_type,
+     report_to="tensorboard"
+ )
+
+ # Set supervised fine-tuning parameters
+
+ def preprocess_function(examples):
+     return tokenizer(examples["text"], truncation=True, max_length=512)
+
+ tokenized_dataset = dataset.map(preprocess_function, batched=True)
+
+ trainer = SFTTrainer(
+     model=model,
+     train_dataset=tokenized_dataset,
+     peft_config=peft_config,
+     tokenizer=tokenizer,
+     args=training_arguments,
+     packing=packing,
+ )
+
+ # Train model
+ trainer.train()
+
+ # Save trained model
+ trainer.model.save_pretrained(new_model)
  ```
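
Once training finishes and the adapter is saved to `new_model`, you can try out the fine-tuned model. The sketch below is not part of the script above; it reuses the paths defined there and assumes the Llama-2 chat prompt format used by the guanaco-llama2-1k dataset:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

base_model_name = "/data/bio-eng-llm/llm_repo/NousResearch/Llama-2-7b-chat-hf"
adapter_path = "/data/bio-eng-llm/llm_repo/mlabonne/llama-2-7b-miniguanaco"  # new_model from the script

# Reload the base model in half precision and attach the LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
model = PeftModel.from_pretrained(base_model, adapter_path)
model = model.merge_and_unload()  # optionally fold the adapter weights into the base model

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

prompt = "What is a large language model?"
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
output = generator(f"<s>[INST] {prompt} [/INST]", max_new_tokens=200)
print(output[0]["generated_text"])
```

Since `report_to="tensorboard"` is set in `TrainingArguments`, training curves are written under the output directory (by default `results/runs`) and can be inspected with `tensorboard --logdir results/runs`.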
 
+
 ## License
 
 This repository is licensed under the [MIT License](LICENSE).