Imran1 committed
Commit 939342b
1 Parent(s): 51d3eef

Update code/inference.py

Files changed (1)
  1. code/inference.py +11 -12
code/inference.py CHANGED
@@ -7,6 +7,12 @@ import fcntl # For file locking
 import os # For file operations
 import time # For sleep function
 
+# Set the max_split_size globally at the start
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+
+# Print to verify the environment variable is correctly set
+print(f"PYTORCH_CUDA_ALLOC_CONF: {os.environ.get('PYTORCH_CUDA_ALLOC_CONF')}")
+
 # Global variables to persist the model and tokenizer between invocations
 model = None
 tokenizer = None
@@ -59,17 +65,15 @@ def model_fn(model_dir, context=None):
     offload_dir = "/tmp/offload_dir"
     os.makedirs(offload_dir, exist_ok=True)
 
-    # Reduce memory fragmentation by setting the max split size
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
-
-    # Load and dispatch model across GPUs with tensor parallelism
+    # Load and dispatch model across 4 GPUs using tensor parallelism
     model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype="auto")
     model = load_checkpoint_and_dispatch(
-        model,
-        model_dir,
+        model,
+        model_dir,
         device_map="auto", # Automatically map layers across GPUs
         offload_folder=offload_dir, # Offload parts to disk if needed
-        max_memory={i: "20GiB" for i in range(torch.cuda.device_count())} # Adjust memory per GPU
+        max_memory={i: "20GiB" for i in range(4)}, # Adjust memory per GPU (4 GPUs)
+        no_split_module_classes=["QwenForCausalLM"] # Ensure model is split across the GPUs
     )
 
     # Load the tokenizer
@@ -156,8 +160,3 @@ def output_fn(prediction_output, accept, context=None):
     Convert the model output to a JSON response.
     """
     return json.dumps(prediction_output)
-
-# Memory tracker for debugging
-def track_memory():
-    print(f"Total allocated memory on GPU 0: {torch.cuda.memory_allocated(0) / 1024 ** 3:.2f} GB")
-    print(f"Total reserved memory on GPU 0: {torch.cuda.memory_reserved(0) / 1024 ** 3:.2f} GB")
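
Note on the first change: PYTORCH_CUDA_ALLOC_CONF is read when PyTorch's CUDA caching allocator initialises, which happens lazily on the first CUDA allocation in the process, so setting it inside model_fn after any GPU memory has already been allocated has no effect. Moving the assignment to module import time guarantees it is in place before any GPU memory is touched. A minimal sketch of the required ordering (the probe tensor below is illustrative, not part of the repository):

import os

# Must be set before the first CUDA allocation in this process.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch

if torch.cuda.is_available():
    _probe = torch.zeros(1, device="cuda")  # allocator initialises here and picks up the setting
    print(f"PYTORCH_CUDA_ALLOC_CONF: {os.environ.get('PYTORCH_CUDA_ALLOC_CONF')}")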
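
Note on the dispatch call: accelerate's load_checkpoint_and_dispatch builds a device map within the max_memory budget and spills any remainder to offload_folder; no_split_module_classes lists module classes that must be kept whole on a single device, it does not force a split across GPUs. A self-contained sketch of the loading path, assuming transformers and accelerate are installed; the helper name load_sharded_model is mine, and the class name and 20 GiB budget are taken from the diff as-is rather than verified against the checkpoint:

from accelerate import load_checkpoint_and_dispatch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_sharded_model(model_dir, offload_dir="/tmp/offload_dir", n_gpus=4):
    # Instantiate the model, then let accelerate place its layers across the GPUs.
    model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype="auto")
    model = load_checkpoint_and_dispatch(
        model,
        model_dir,                                    # folder holding the checkpoint shards
        device_map="auto",                            # automatic layer placement
        offload_folder=offload_dir,                   # spill to disk if the GPUs fill up
        max_memory={i: "20GiB" for i in range(n_gpus)},
        no_split_module_classes=["QwenForCausalLM"],  # module classes kept whole on one device
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return model, tokenizer

If the intent is to let accelerate shard the model across all four GPUs, the class usually passed to no_split_module_classes is the model's decoder-block class rather than the top-level *ForCausalLM class.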
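
The last hunk removes the track_memory debugging helper. If per-GPU memory reporting is still wanted while debugging, the same allocator counters can be queried ad hoc; report_gpu_memory below is a hypothetical stand-in, not part of the repository:

import torch

def report_gpu_memory(device=0):
    # Caching-allocator statistics for one device, converted to GiB.
    allocated = torch.cuda.memory_allocated(device) / 1024 ** 3
    reserved = torch.cuda.memory_reserved(device) / 1024 ** 3
    print(f"GPU {device}: allocated {allocated:.2f} GiB, reserved {reserved:.2f} GiB")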