## Multiple GPUs

In [None]:

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datautils import MyTrainDataset
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os
import argparse
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments

def ddp_setup(rank, world_size):
    """Join the NCCL process group and select this process's CUDA device.

    Args:
        rank: Index of the calling process; also used as the CUDA device index.
        world_size: Total number of processes taking part in the group.
    """
    # Rendezvous endpoint for the process group; fixed to this machine.
    os.environ.update({"MASTER_ADDR": "localhost", "MASTER_PORT": "12355"})
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

class Trainer:
    """Per-process training loop that wraps the model in DistributedDataParallel.

    Every batch from ``train_data`` is unpacked as ``(source, targets)`` and the
    loss is ``F.cross_entropy(model(source), targets)``.
    """

    def __init__(self, model, train_data, optimizer, gpu_id, save_every):
        """Move the model to ``gpu_id`` and wrap it in DDP.

        Args:
            model: Module to train.
            train_data: DataLoader to iterate; its sampler must provide
                ``set_epoch``.
            optimizer: Optimizer stepped once per batch.
            gpu_id: Device this process trains on; the process whose
                ``gpu_id`` equals 0 writes the checkpoints.
            save_every: A checkpoint is written on epochs divisible by this.
        """
        self.gpu_id = gpu_id
        self.train_data = train_data
        self.optimizer = optimizer
        self.save_every = save_every
        # Fix: the DDP wrapper used to be assigned to ``f.model`` (``f`` is
        # undefined), raising NameError and leaving ``self.model`` unwrapped
        # even though ``_save_checkpoint`` reads ``self.model.module``.
        self.model = DDP(model.to(gpu_id), device_ids=[gpu_id])

    def _run_batch(self, source, targets):
        """Run one optimisation step on a single batch."""
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, targets)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        """Train on every batch of ``train_data`` once."""
        # Peek at one batch only to report the batch size in the log line.
        b_sz = len(next(iter(self.train_data))[0])
        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
        # Tell the sampler which epoch this is before iterating.
        self.train_data.sampler.set_epoch(epoch)
        for source, targets in self.train_data:
            source = source.to(self.gpu_id)
            targets = targets.to(self.gpu_id)
            self._run_batch(source, targets)

    def _save_checkpoint(self, epoch):
        """Write the unwrapped model's ``state_dict`` to ``checkpoint.pt``."""
        ckp = self.model.module.state_dict()
        PATH = "checkpoint.pt"
        torch.save(ckp, PATH)
        print(f"Epoch {epoch} | Training checkpoint saved at {PATH}")

    def train(self, max_epochs):
        """Run ``max_epochs`` epochs, checkpointing from the ``gpu_id == 0`` process."""
        for epoch in range(max_epochs):
            self._run_epoch(epoch)
            if self.gpu_id == 0 and epoch % self.save_every == 0:
                self._save_checkpoint(epoch)

def load_train_objs():
    """Build the dataset, quantized model, LoRA config, tokenizer and training args.

    Returns:
        A 5-tuple ``(dataset, model, peft_config, tokenizer, training_arguments)``:
        the first 100 rows of ``ruslanmv/ai-medical-dataset``, the 4-bit
        ``Meta-Llama-3-8B-Instruct`` causal LM, the ``LoraConfig``, the
        tokenizer (padding with its EOS token) and the ``TrainingArguments``.
        A sequence-length limit is not part of the returned objects; callers
        that need one must define it themselves.
    """
    dataset = load_dataset("ruslanmv/ai-medical-dataset", split="train")
    dataset = dataset.select(range(100))

    model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    # Fix: device_map="auto" spreads the layers over every visible GPU, which
    # conflicts with DDP's one-full-replica-per-process layout. Pin the whole
    # model to the device selected for this process instead.
    if torch.cuda.is_available():
        device_map = {"": torch.cuda.current_device()}
    else:
        device_map = "auto"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        trust_remote_code=True,
        use_cache=False,
        device_map=device_map,
    )

    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=32,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"],
        modules_to_save=["embed_tokens", "input_layernorm", "post_attention_layernorm", "norm"],
    )

    training_arguments = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        optim="adamw_torch",
        save_steps=10,
        logging_steps=1,
        learning_rate=2e-4,
        fp16=True,
        max_grad_norm=0.3,
        max_steps=1,
        warmup_ratio=0.1,
        group_by_length=True,
        lr_scheduler_type="cosine",
        gradient_checkpointing=True,
    )

    return dataset, model, peft_config, tokenizer, training_arguments

def prepare_dataloader(dataset, batch_size):
    """Build a DataLoader that draws its indices from a DistributedSampler.

    Args:
        dataset: Dataset to draw batches from.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` with pinned memory enabled. Loader-level shuffling is
        off because sample ordering is delegated to the sampler.
    """
    distributed_sampler = DistributedSampler(dataset)
    loader_options = {
        "batch_size": batch_size,
        "pin_memory": True,
        "shuffle": False,
        "sampler": distributed_sampler,
    }
    return DataLoader(dataset, **loader_options)

def main(rank, world_size, save_every, total_epochs, batch_size):
    """Per-process entry point: set up DDP, build the objects, train, tear down.

    Args:
        rank: Index of this process, also used as its GPU id.
        world_size: Total number of processes.
        save_every: Checkpoint interval in epochs, forwarded to ``Trainer``.
        total_epochs: Number of epochs to train.
        batch_size: Batch size of the distributed DataLoader.
    """
    # Fix: ``max_seq_length`` was passed to SFTTrainer without being defined in
    # this scope (NameError). 512 is the limit used throughout this notebook.
    max_seq_length = 512

    ddp_setup(rank, world_size)
    try:
        dataset, model, peft_config, tokenizer, training_arguments = load_train_objs()
        train_data = prepare_dataloader(dataset, batch_size)
        sft_trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            peft_config=peft_config,
            dataset_text_field="context",
            max_seq_length=max_seq_length,
            tokenizer=tokenizer,
            args=training_arguments,
        )
        # Fix: a freshly constructed trainer has ``optimizer`` set to None; it
        # is only built by create_optimizer() (normally called inside train()).
        if sft_trainer.optimizer is None:
            sft_trainer.create_optimizer()
        # NOTE(review): Trainer unpacks batches as (source, targets) pairs;
        # confirm the dataset rows collate to that shape.
        ddp_trainer = Trainer(
            model,
            train_data,
            optimizer=sft_trainer.optimizer,
            gpu_id=rank,
            save_every=save_every,
        )
        ddp_trainer.train(total_epochs)
    finally:
        # Always release the process group, even when training fails.
        destroy_process_group()

# Run configuration handed to every worker process.
TOTAL_EPOCHS, SAVE_EVERY, BATCH_SIZE = 10, 2, 32

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)  # Add this line
    world_size = torch.cuda.device_count()
    # mp.spawn passes the process index (rank) as the first argument of main,
    # followed by the values below.
    worker_args = (world_size, SAVE_EVERY, TOTAL_EPOCHS, BATCH_SIZE)
    mp.spawn(main, args=worker_args, nprocs=world_size)


In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os
import argparse
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments
def ddp_setup(rank, world_size):
    """
    Initialise the default NCCL process group and select the CUDA device.

    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    rendezvous = {"MASTER_ADDR": "localhost", "MASTER_PORT": "12355"}
    for variable, value in rendezvous.items():
        os.environ[variable] = value
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

class Trainer:
    """Per-process DDP training loop with periodic checkpointing.

    Batches from ``train_data`` are unpacked as ``(source, targets)`` and the
    loss is ``F.cross_entropy(model(source), targets)``.
    """

    def __init__(self, model, train_data, optimizer, gpu_id, save_every):
        """Move the model to ``gpu_id`` and wrap it in DistributedDataParallel.

        Args:
            model: Module to train.
            train_data: DataLoader to iterate; its sampler must provide
                ``set_epoch``.
            optimizer: Optimizer stepped once per batch.
            gpu_id: Device this process trains on; the process whose
                ``gpu_id`` equals 0 writes the checkpoints.
            save_every: A checkpoint is written on epochs divisible by this.
        """
        self.gpu_id = gpu_id
        self.model = model.to(gpu_id)
        self.train_data = train_data
        self.optimizer = optimizer
        self.save_every = save_every
        self.model = DDP(model, device_ids=[gpu_id])

    def _run_batch(self, source, targets):
        """Run one optimisation step on a single batch."""
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, targets)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        """Train on every batch of ``train_data`` once."""
        # Peek at one batch only to report the batch size in the log line.
        b_sz = len(next(iter(self.train_data))[0])
        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
        # Tell the sampler which epoch this is before iterating.
        self.train_data.sampler.set_epoch(epoch)
        for source, targets in self.train_data:
            source = source.to(self.gpu_id)
            targets = targets.to(self.gpu_id)
            self._run_batch(source, targets)

    def _save_checkpoint(self, epoch):
        """Write the unwrapped model's ``state_dict`` to ``checkpoint.pt``."""
        # Fix: the original called ``.statt()``, which does not exist on a
        # module and raised AttributeError at the first checkpoint.
        ckp = self.model.module.state_dict()
        PATH = "checkpoint.pt"
        torch.save(ckp, PATH)
        print(f"Epoch {epoch} | Training checkpoint saved at {PATH}")

    def train(self, max_epochs):
        """Run ``max_epochs`` epochs, checkpointing from the ``gpu_id == 0`` process."""
        for epoch in range(max_epochs):
            self._run_epoch(epoch)
            if self.gpu_id == 0 and epoch % self.save_every == 0:
                self._save_checkpoint(epoch)

def load_train_objs():
    """Create everything the training run needs.

    Returns:
        A 5-tuple ``(dataset, model, peft_config, tokenizer, training_arguments)``:
        the first 100 rows of ``ruslanmv/ai-medical-dataset``, the 4-bit
        ``Meta-Llama-3-8B-Instruct`` causal LM, the ``LoraConfig``, the
        tokenizer (padding with its EOS token) and the ``TrainingArguments``.
        A sequence-length limit is not part of the returned objects; callers
        that need one must define it themselves.
    """
    dataset = load_dataset("ruslanmv/ai-medical-dataset", split="train")
    dataset = dataset.select(range(100))

    model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    # Fix: device_map="auto" spreads the layers over every visible GPU, which
    # conflicts with DDP's one-full-replica-per-process layout. Pin the whole
    # model to the device selected for this process instead.
    if torch.cuda.is_available():
        device_map = {"": torch.cuda.current_device()}
    else:
        device_map = "auto"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        trust_remote_code=True,
        use_cache=False,
        device_map=device_map,
    )

    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=32,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"],
        modules_to_save=["embed_tokens", "input_layernorm", "post_attention_layernorm", "norm"],
    )

    training_arguments = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        optim="adamw_torch",
        save_steps=10,
        logging_steps=1,
        learning_rate=2e-4,
        fp16=True,
        max_grad_norm=0.3,
        max_steps=1,
        warmup_ratio=0.1,
        group_by_length=True,
        lr_scheduler_type="cosine",
        gradient_checkpointing=True,
    )

    return dataset, model, peft_config, tokenizer, training_arguments

def prepare_dataloader(dataset, batch_size):
    """Wrap ``dataset`` in a DataLoader driven by a DistributedSampler.

    Args:
        dataset: Dataset to draw batches from.
        batch_size: Number of samples per batch.

    Returns:
        The configured ``DataLoader`` (pinned memory on, loader-level
        shuffling off because the sampler decides the order).
    """
    sampler = DistributedSampler(dataset)
    return DataLoader(
        dataset,
        sampler=sampler,
        shuffle=False,
        pin_memory=True,
        batch_size=batch_size,
    )

In [None]:
import torch
import torch.multiprocessing as mp

def main(rank, world_size):
    """Per-process entry point: set up DDP, build the objects, train, tear down.

    Args:
        rank: Index of this process, also used as its GPU id.
        world_size: Total number of processes.
    """
    # Define the parameters as constants
    TOTAL_EPOCHS = 10
    SAVE_EVERY = 2
    BATCH_SIZE = 32
    # Fix: ``max_seq_length`` was passed to SFTTrainer without being defined in
    # this scope (NameError). 512 is the limit used throughout this notebook.
    max_seq_length = 512

    torch.cuda.init()
    ddp_setup(rank, world_size)
    try:
        dataset, model, peft_config, tokenizer, training_arguments = load_train_objs()
        train_data = prepare_dataloader(dataset, BATCH_SIZE)
        sft_trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            peft_config=peft_config,
            dataset_text_field="context",
            max_seq_length=max_seq_length,
            tokenizer=tokenizer,
            args=training_arguments,
        )
        # Fix: a freshly constructed trainer has ``optimizer`` set to None; it
        # is only built by create_optimizer() (normally called inside train()).
        if sft_trainer.optimizer is None:
            sft_trainer.create_optimizer()
        # NOTE(review): Trainer unpacks batches as (source, targets) pairs;
        # confirm the dataset rows collate to that shape.
        ddp_trainer = Trainer(
            model,
            train_data,
            optimizer=sft_trainer.optimizer,
            gpu_id=rank,
            save_every=SAVE_EVERY,
        )
        ddp_trainer.train(TOTAL_EPOCHS)
    finally:
        # Always release the process group, even when training fails.
        destroy_process_group()

if __name__ == "__main__":
    # Fix: without force=True this raises RuntimeError("context has already
    # been set") when the cell is re-run or another cell already chose a
    # start method.
    mp.set_start_method('spawn', force=True)
    world_size = torch.cuda.device_count()

    # Workaround for Jupyter Notebook and interactive environments:
    # start one worker process per visible GPU by hand.
    processes = []
    for rank in range(world_size):
        p = mp.Process(target=main, args=(rank, world_size))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # Surface worker crashes instead of finishing silently.
    failed_ranks = [idx for idx, proc in enumerate(processes) if proc.exitcode != 0]
    if failed_ranks:
        raise RuntimeError(f"Worker process(es) for rank(s) {failed_ranks} exited with an error")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
import os
import socket

# Distributed training setup (assuming all GPUs are available on a single machine)
def init_distributed(rank, world_size):
    """Initializes distributed training using `nccl` backend.

    Publishes the rendezvous settings through environment variables and joins
    the default process group with ``init_method='env://'``.

    Args:
        rank: Index of the calling process.
        world_size: Total number of processes in the group.
    """
    hostname = socket.gethostname()
    if rank == 0:
        os.environ["MASTER_ADDR"] = hostname  # Set MASTER_ADDR using rank 0's hostname
    else:
        # Fix: environment variables are per-process, so sleeping could never
        # make rank 0's MASTER_ADDR visible here. On a single machine every
        # rank sees the same hostname; keep any address a launcher provided.
        os.environ.setdefault("MASTER_ADDR", hostname)
    os.environ["MASTER_PORT"] = "12345"  # Set MASTER_PORT environment variable
    os.environ["RANK"] = str(rank)  # Set RANK environment variable
    os.environ["WORLD_SIZE"] = str(world_size)  # Set WORLD_SIZE environment variable
    torch.distributed.init_process_group(backend='nccl', init_method='env://')

# Cleanup after training
def cleanup_distributed():
    """Destroy the default process group if one is active; otherwise do nothing."""
    # Check is_available() first: PyTorch builds without distributed support
    # may not expose is_initialized() at all.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()

# Model and tokenizer selection
model_name = "facebook/bart-base"  # Replace with your desired model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Dataset loading (replace with your dataset and field names)
dataset = load_dataset("glue", "mnli", split="train")
text_field = "premise"  # Field whose text is used for training

# Training arguments (adjust hyperparameters as needed)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,  # Adjust based on GPU memory
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=3,  # Adjust training time as needed
)

# Number of visible CUDA devices (informational only; the number of training
# processes is decided by the launcher, not by this count).
world_size = torch.cuda.device_count()

# Fix: the original called init_distributed(rank=0, world_size=world_size)
# from this single process whenever more than one GPU was visible, so the
# rendezvous waited for ranks that were never started. It also wrapped the
# model in DistributedDataParallel by hand before giving it to the trainer.
# The trainer is now built once with the plain model and left to set up
# distributed execution itself when the script is started by a distributed
# launcher (e.g. torchrun).
trainer = SFTTrainer(
    model=model,
    args=training_args,  # Pass training_args as 'args' instead of 'training_args'
    train_dataset=dataset,
    dataset_text_field=text_field,
    compute_metrics=None,  # You can define your custom metrics here
)
if training_args.world_size > 1:
    print("Trainer For distributed training loaded")
else:
    print("Trainer For single-GPU loaded")

# Start training
trainer.train()

# Cleanup after training
cleanup_distributed()


In [None]:
import os
import torch
import torch.multiprocessing as mp
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments
from torch.nn.parallel import DistributedDataParallel as DDP


# Distributed training setup
def init_distributed(rank=None, world_size=None):
    """Join the NCCL process group and select this process's GPU.

    Args:
        rank: Index of the calling process. When omitted it is read from the
            ``RANK`` environment variable.
        world_size: Total number of processes. Defaults to the number of
            visible CUDA devices.

    Raises:
        ValueError: If no rank is given and ``RANK`` is not set.
    """
    # Fix: the original took no arguments and read an undefined name ``rank``,
    # which raised NameError inside every spawned worker.
    if rank is None:
        if "RANK" not in os.environ:
            raise ValueError("init_distributed() needs a rank argument or the RANK environment variable")
        rank = int(os.environ["RANK"])
    if world_size is None:
        world_size = torch.cuda.device_count()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12345"
    torch.distributed.init_process_group(backend='nccl', world_size=world_size, rank=rank)
    # Give every process its own device instead of all sharing device 0.
    torch.cuda.set_device(rank % torch.cuda.device_count())


def cleanup_distributed():
    """Destroy the default process group if one is active; otherwise do nothing."""
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()


def main_worker(rank, world_size):
    """Worker entry point: join the process group, fine-tune, then clean up.

    Args:
        rank: Index of this worker process.
        world_size: Total number of worker processes.
    """
    init_distributed(rank, world_size)
    try:
        # Load the dataset and keep only its first 100 rows
        dataset_name = "ruslanmv/ai-medical-dataset"
        dataset = load_dataset(dataset_name, split="train")
        dataset = dataset.select(range(100))

        # Load the model + tokenizer
        model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        # Fix: the original computed a ``device`` for this rank but never used
        # it. Load the whole model onto the device selected for this process.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            trust_remote_code=True,
            use_cache=False,
            device_map={"": torch.cuda.current_device()},
        )

        # PEFT config
        # Fix: ``lora_r`` was defined but never passed, and ``bias`` was
        # missing. ``lora_alpha`` is set to 16 to match the other training
        # cells of this notebook (it was 1 here).
        lora_alpha = 16
        lora_dropout = 0.1
        lora_r = 32  # 64
        peft_config = LoraConfig(
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            r=lora_r,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"],
            modules_to_save=["embed_tokens", "input_layernorm", "post_attention_layernorm", "norm"],
        )

        # Args
        max_seq_length = 512
        output_dir = "./results"
        per_device_train_batch_size = 2  # reduced batch size to avoid OOM
        gradient_accumulation_steps = 2
        optim = "adamw_torch"
        save_steps = 10
        logging_steps = 1
        learning_rate = 2e-4
        max_grad_norm = 0.3
        max_steps = 1  # 300 Approx the size of guanaco at bs 8, ga 2, 2 GPUs.
        warmup_ratio = 0.1
        lr_scheduler_type = "cosine"
        training_arguments = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            save_steps=save_steps,
            logging_steps=logging_steps,
            learning_rate=learning_rate,
            fp16=True,
            max_grad_norm=max_grad_norm,
            max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=True,
            lr_scheduler_type=lr_scheduler_type,
            gradient_checkpointing=True,  # gradient checkpointing
            #report_to="wandb",
        )

        # Trainer
        trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            peft_config=peft_config,
            dataset_text_field="context",
            max_seq_length=max_seq_length,
            tokenizer=tokenizer,
            args=training_arguments,
        )

        # Train :)
        trainer.train()
    finally:
        # Release the process group even when training fails.
        cleanup_distributed()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    # Fix: without force=True this raises RuntimeError("context has already
    # been set") when the cell is re-run or another cell already chose a
    # start method.
    mp.set_start_method('spawn', force=True)
    processes = []
    for rank in range(world_size):
        p = mp.Process(target=main_worker, args=(rank, world_size))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    # Surface worker crashes instead of finishing silently.
    failed_ranks = [idx for idx, proc in enumerate(processes) if proc.exitcode != 0]
    if failed_ranks:
        raise RuntimeError(f"Worker process(es) for rank(s) {failed_ranks} exited with an error")


In [None]:
def finetune():
    """Fine-tune 4-bit Llama-3-8B-Instruct with LoRA on 100 dataset rows.

    Loads the dataset, tokenizer and quantized model, builds the LoRA and
    training configuration, and runs ``SFTTrainer.train()``. Takes no
    arguments and returns nothing.
    """
    from datasets import load_dataset
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
    from peft import LoraConfig
    from trl import SFTTrainer
    from transformers import TrainingArguments

    # Load the dataset
    dataset_name = "ruslanmv/ai-medical-dataset"
    dataset = load_dataset(dataset_name, split="train")
    # Keep only the first 100 rows of the dataset
    dataset = dataset.select(range(100))
    # Load the model + tokenizer
    model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    # Load the whole model onto the CUDA device currently selected for this
    # process, so each worker keeps its replica on its own GPU.
    device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        trust_remote_code=True,
        use_cache=False,
        device_map=device_map,
    )
    # Check for available GPUs
    # Fix: the original wrapped the model in DistributedDataParallel here.
    # The wrapper hides the model's own attributes from PEFT and the trainer,
    # and the trainer applies its own distributed wrapping, so the plain model
    # is passed on instead.
    if torch.cuda.device_count() > 1:
        print("Multiple GPUs detected, enabling DataParallel...")
    else:
        print("Using single GPU...")
    # PEFT config
    lora_alpha = 16
    lora_dropout = 0.1
    lora_r = 32  # 64
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"],
        modules_to_save=["embed_tokens", "input_layernorm", "post_attention_layernorm", "norm"],
    )
    # Args
    max_seq_length = 512
    output_dir = "./results"
    per_device_train_batch_size = 2  # reduced batch size to avoid OOM
    gradient_accumulation_steps = 2
    optim = "adamw_torch"
    save_steps = 10
    logging_steps = 1
    learning_rate = 2e-4
    max_grad_norm = 0.3
    max_steps = 1  # 300 Approx the size of guanaco at bs 8, ga 2, 2 GPUs.
    warmup_ratio = 0.1
    lr_scheduler_type = "cosine"

    training_arguments = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        fp16=True,
        max_grad_norm=max_grad_norm,
        max_steps=max_steps,
        warmup_ratio=warmup_ratio,
        group_by_length=True,
        lr_scheduler_type=lr_scheduler_type,
        gradient_checkpointing=True,  # gradient checkpointing
        #report_to="wandb",
    )
    # Trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        dataset_text_field="context",
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
    )
    # Train :)
    trainer.train()

In [None]:
import os
import torch
import torch.multiprocessing as mp

def init_distributed(rank, world_size, local_rank=0):  # Add local_rank argument
    """Join the NCCL process group, then select the CUDA device ``local_rank``.

    Args:
        rank: Index of the calling process in the group.
        world_size: Total number of processes in the group.
        local_rank: CUDA device index to make current for this process.
    """
    # Rendezvous endpoint; adjust the port if needed.
    os.environ.update(MASTER_ADDR="localhost", MASTER_PORT="12345")
    is_first_rank = rank == 0
    if is_first_rank:
        print("Initializing distributed process group...")
    torch.distributed.init_process_group(backend='nccl', world_size=world_size, rank=rank)
    torch.cuda.set_device(local_rank)  # Set unique GPU device for each process

def cleanup_distributed():
    """Destroy the default process group if one is active; otherwise do nothing."""
    # Fix: calling destroy_process_group() with no active group raises, so a
    # failed or skipped initialisation used to turn cleanup into a second error.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()

def main_worker(rank, world_size):
    """Worker entry point: join the process group, run ``finetune``, clean up.

    Args:
        rank: Index of this worker process.
        world_size: Total number of worker processes.
    """
    local_rank = rank % torch.cuda.device_count()  # Assign unique local rank
    init_distributed(rank, world_size, local_rank)
    try:
        # finetune() takes no device argument; the device selected by
        # init_distributed() is the current CUDA device while it runs.
        finetune()
    finally:
        # Fix: release the process group even when fine-tuning raises.
        cleanup_distributed()
if __name__ == "__main__":
    # Fix: no start method was selected here. CUDA cannot be used in forked
    # subprocesses, so request "spawn" explicitly; force=True keeps re-runs of
    # the cell from raising "context has already been set".
    mp.set_start_method("spawn", force=True)
    world_size = torch.cuda.device_count()

    # Workaround for Jupyter Notebook and interactive environments:
    # start one worker process per visible GPU by hand.
    processes = []
    for rank in range(world_size):
        p = mp.Process(target=main_worker, args=(rank, world_size))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # Surface worker crashes instead of finishing silently.
    failed_ranks = [idx for idx, proc in enumerate(processes) if proc.exitcode != 0]
    if failed_ranks:
        raise RuntimeError(f"Worker process(es) for rank(s) {failed_ranks} exited with an error")
