Update code/inference.py
code/inference.py  +11 -12
@@ -7,6 +7,12 @@ import fcntl # For file locking
 import os # For file operations
 import time # For sleep function
 
+# Set the max_split_size globally at the start
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+
+# Print to verify the environment variable is correctly set
+print(f"PYTORCH_CUDA_ALLOC_CONF: {os.environ.get('PYTORCH_CUDA_ALLOC_CONF')}")
+
 # Global variables to persist the model and tokenizer between invocations
 model = None
 tokenizer = None
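
Note on the hunk above: PyTorch reads PYTORCH_CUDA_ALLOC_CONF once, when the CUDA caching allocator is first initialized, so setting it after any CUDA work has started has no effect. Moving the assignment to module import time, before torch touches a GPU, is what makes the max_split_size_mb:128 setting stick. A minimal standalone sketch of the ordering this relies on (not part of inference.py):

    import os

    # Must run before the first CUDA allocation; the caching allocator
    # reads PYTORCH_CUDA_ALLOC_CONF once, at initialization time.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

    import torch

    if torch.cuda.is_available():
        x = torch.zeros(1, device="cuda")  # first allocation; the config now applies
        print(f"allocated: {torch.cuda.memory_allocated(0)} bytes")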
@@ -59,17 +65,15 @@ def model_fn(model_dir, context=None):
     offload_dir = "/tmp/offload_dir"
     os.makedirs(offload_dir, exist_ok=True)
 
-    #
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
-
-    # Load and dispatch model across GPUs with tensor parallelism
+    # Load and dispatch model across 4 GPUs using tensor parallelism
     model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype="auto")
     model = load_checkpoint_and_dispatch(
-        model,
-        model_dir,
+        model,
+        model_dir,
         device_map="auto", # Automatically map layers across GPUs
         offload_folder=offload_dir, # Offload parts to disk if needed
-        max_memory={i: "20GiB" for i in range(4)},
+        max_memory={i: "20GiB" for i in range(4)}, # Adjust memory per GPU (4 GPUs)
+        no_split_module_classes=["QwenForCausalLM"] # Ensure model is split across the GPUs
     )
 
     # Load the tokenizer
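
Note on the hunk above: with device_map="auto", accelerate computes a layer-to-device placement under the max_memory budget, and load_checkpoint_and_dispatch loads checkpoint shards directly onto the assigned devices, spilling to offload_folder when the budget is exceeded. One hedge: in accelerate, no_split_module_classes normally lists the repeating block classes that must not be sheared across devices (a decoder layer, say), not the top-level model class; listing the whole *ForCausalLM class asks the dispatcher to keep the entire model on one device, the opposite of the inline comment's intent. A sketch for previewing the placement before committing to it, with the path and class name as assumptions rather than values taken from this repo:

    from accelerate import init_empty_weights, infer_auto_device_map
    from transformers import AutoConfig, AutoModelForCausalLM

    model_dir = "/opt/ml/model"  # hypothetical; SageMaker passes the real path to model_fn

    # Build a weightless skeleton of the model, then ask accelerate
    # how it would distribute the layers under the same 4x20GiB budget.
    config = AutoConfig.from_pretrained(model_dir)
    with init_empty_weights():
        empty_model = AutoModelForCausalLM.from_config(config)

    device_map = infer_auto_device_map(
        empty_model,
        max_memory={i: "20GiB" for i in range(4)},
        no_split_module_classes=["Qwen2DecoderLayer"],  # assumed block class; check model._no_split_modules
    )
    print(device_map)  # e.g. {"model.embed_tokens": 0, "model.layers.0": 0, ...}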
@@ -156,8 +160,3 @@ def output_fn(prediction_output, accept, context=None):
     Convert the model output to a JSON response.
     """
     return json.dumps(prediction_output)
-
-# Memory tracker for debugging
-def track_memory():
-    print(f"Total allocated memory on GPU 0: {torch.cuda.memory_allocated(0) / 1024 ** 3:.2f} GB")
-    print(f"Total reserved memory on GPU 0: {torch.cuda.memory_reserved(0) / 1024 ** 3:.2f} GB")
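
The removed track_memory helper only reported device 0, which is of limited use on a multi-GPU host. If memory debugging is needed again, a per-GPU variant is a small sketch (assuming torch is importable, as the deleted code also did):

    import torch

    def track_memory():
        # Report allocated vs. reserved memory for every visible GPU,
        # not just device 0 as the deleted helper did.
        for i in range(torch.cuda.device_count()):
            alloc = torch.cuda.memory_allocated(i) / 1024 ** 3
            reserved = torch.cuda.memory_reserved(i) / 1024 ** 3
            print(f"GPU {i}: allocated {alloc:.2f} GB, reserved {reserved:.2f} GB")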