Update code/inference.py
code/inference.py  +11 -12
@@ -7,6 +7,12 @@ import fcntl # For file locking
 import os # For file operations
 import time # For sleep function
 
+# Set the max_split_size globally at the start
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+
+# Print to verify the environment variable is correctly set
+print(f"PYTORCH_CUDA_ALLOC_CONF: {os.environ.get('PYTORCH_CUDA_ALLOC_CONF')}")
+
 # Global variables to persist the model and tokenizer between invocations
 model = None
 tokenizer = None
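
Note on the hunk above: PyTorch reads PYTORCH_CUDA_ALLOC_CONF once, when the CUDA caching allocator is first initialized, so setting it after any CUDA work has started has no effect. Moving the assignment to module import time, before torch touches a GPU, is what makes the max_split_size_mb:128 setting stick. A minimal standalone sketch of the ordering this relies on (not part of inference.py):

    import os

    # Must run before the first CUDA allocation; the caching allocator
    # reads PYTORCH_CUDA_ALLOC_CONF once, at initialization time.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

    import torch

    if torch.cuda.is_available():
        x = torch.zeros(1, device="cuda")  # first allocation; the config now applies
        print(f"allocated: {torch.cuda.memory_allocated(0)} bytes")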
@@ -59,17 +65,15 @@ def model_fn(model_dir, context=None):
     offload_dir = "/tmp/offload_dir"
     os.makedirs(offload_dir, exist_ok=True)
 
-    #
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
-
-    # Load and dispatch model across GPUs with tensor parallelism
+    # Load and dispatch model across 4 GPUs using tensor parallelism
     model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype="auto")
     model = load_checkpoint_and_dispatch(
-        model,
-        model_dir,
+        model,
+        model_dir,
         device_map="auto", # Automatically map layers across GPUs
         offload_folder=offload_dir, # Offload parts to disk if needed
-        max_memory={i: "20GiB" for i in range(4)},
+        max_memory={i: "20GiB" for i in range(4)}, # Adjust memory per GPU (4 GPUs)
+        no_split_module_classes=["QwenForCausalLM"] # Ensure model is split across the GPUs
     )
 
     # Load the tokenizer
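
Note on the hunk above: with device_map="auto", accelerate computes a layer-to-device placement under the max_memory budget, and load_checkpoint_and_dispatch loads checkpoint shards directly onto the assigned devices, spilling to offload_folder when the budget is exceeded. One hedge: in accelerate, no_split_module_classes normally lists the repeating block classes that must not be sheared across devices (a decoder layer, say), not the top-level model class; listing the whole *ForCausalLM class asks the dispatcher to keep the entire model on one device, the opposite of the inline comment's intent. A sketch for previewing the placement before committing to it, with the path and class name as assumptions rather than values taken from this repo:

    from accelerate import init_empty_weights, infer_auto_device_map
    from transformers import AutoConfig, AutoModelForCausalLM

    model_dir = "/opt/ml/model"  # hypothetical; SageMaker passes the real path to model_fn

    # Build a weightless skeleton of the model, then ask accelerate
    # how it would distribute the layers under the same 4x20GiB budget.
    config = AutoConfig.from_pretrained(model_dir)
    with init_empty_weights():
        empty_model = AutoModelForCausalLM.from_config(config)

    device_map = infer_auto_device_map(
        empty_model,
        max_memory={i: "20GiB" for i in range(4)},
        no_split_module_classes=["Qwen2DecoderLayer"],  # assumed block class; check model._no_split_modules
    )
    print(device_map)  # e.g. {"model.embed_tokens": 0, "model.layers.0": 0, ...}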
@@ -156,8 +160,3 @@ def output_fn(prediction_output, accept, context=None):
     Convert the model output to a JSON response.
     """
     return json.dumps(prediction_output)
-
-# Memory tracker for debugging
-def track_memory():
-    print(f"Total allocated memory on GPU 0: {torch.cuda.memory_allocated(0) / 1024 ** 3:.2f} GB")
-    print(f"Total reserved memory on GPU 0: {torch.cuda.memory_reserved(0) / 1024 ** 3:.2f} GB")
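
The removed track_memory helper only reported device 0, which is of limited use on a multi-GPU host. If memory debugging is needed again, a per-GPU variant is a small sketch (assuming torch is importable, as the deleted code also did):

    import torch

    def track_memory():
        # Report allocated vs. reserved memory for every visible GPU,
        # not just device 0 as the deleted helper did.
        for i in range(torch.cuda.device_count()):
            alloc = torch.cuda.memory_allocated(i) / 1024 ** 3
            reserved = torch.cuda.memory_reserved(i) / 1024 ** 3
            print(f"GPU {i}: allocated {alloc:.2f} GB, reserved {reserved:.2f} GB")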