Update code/inference.py

code/inference.py  (+24 −8)
```diff
@@ -3,6 +3,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing import List, Dict
 from accelerate import load_checkpoint_and_dispatch
+import os
 
 # Global variables to persist the model and tokenizer between invocations
 model = None
@@ -26,18 +27,33 @@ def model_fn(model_dir, context=None):
 
     if model is None:  # Check if the model is already loaded
         print("Loading the model and tokenizer...")
+
         # Define an offload directory for any model components that can't fit in GPU memory
-        offload_dir = "/tmp/offload_dir"
+        offload_dir = "/tmp/offload_dir"
+        os.makedirs(offload_dir, exist_ok=True)  # Ensure the directory exists and SageMaker has write access
+
+        # Explicitly map the model across 8 GPUs
+        device_map = {
+            "transformer.h.0": 0, "transformer.h.1": 0,
+            "transformer.h.2": 1, "transformer.h.3": 1,
+            "transformer.h.4": 2, "transformer.h.5": 2,
+            "transformer.h.6": 3, "transformer.h.7": 3,
+            "transformer.h.8": 4, "transformer.h.9": 4,
+            "transformer.h.10": 5, "transformer.h.11": 5,
+            "transformer.h.12": 6, "transformer.h.13": 6,
+            "transformer.h.14": 7, "transformer.h.15": 7,
+            "transformer.ln_f": 7, "lm_head": 7
+        }
 
-        # Load and dispatch the model across multiple GPUs
+        # Load and dispatch the model across multiple GPUs with offloading
         model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
         model = load_checkpoint_and_dispatch(
-            model,
-            model_dir,
-            device_map=
+            model,
+            model_dir,
+            device_map=device_map,  # Explicitly map layers across 8 GPUs
            offload_folder=offload_dir,  # Offload parts of the model to disk if GPU memory is insufficient
         )
-
+
         # Load the tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
@@ -98,14 +114,14 @@ def predict_fn(input_data, model_and_tokenizer):
         return {"error": str(e), "details": repr(e)}
 
 # Define input format for SageMaker
-def input_fn(serialized_input_data, content_type):
+def input_fn(serialized_input_data, content_type, context=None):
     """
     Prepare the input data for inference.
     """
     return serialized_input_data
 
 # Define output format for SageMaker
-def output_fn(prediction_output, accept):
+def output_fn(prediction_output, accept, context=None):
     """
     Convert the model output to a JSON response.
     """
```
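Note on the device map: the previous call passed `device_map=` with no value, so `model_fn` could not have run as written, and this commit fixes that by supplying an explicit map. The hard-coded map, however, only names `transformer.h.0` through `transformer.h.15` plus `transformer.ln_f` and `lm_head`; embedding modules and any decoder blocks past `h.15` on a deeper checkpoint are left unassigned, and the `transformer.h.<i>` naming is GPT-2-style, so it must match the actual architecture. A hedged alternative is to derive the map instead of hard-coding it. The sketch below is an illustration under those assumptions, not code from this commit; the `"38GiB"` budget and `"GPT2Block"` class name are placeholders to replace with the real instance limits and block class:

```python
import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

model_dir = "/opt/ml/model"  # SageMaker's default model directory; inside model_fn use its argument

config = AutoConfig.from_pretrained(model_dir)
with init_empty_weights():  # builds the module tree on the meta device, no weights allocated
    meta_model = AutoModelForCausalLM.from_config(config)

device_map = infer_auto_device_map(
    meta_model,
    max_memory={i: "38GiB" for i in range(8)},  # placeholder per-GPU budget; tune to the instance
    no_split_module_classes=["GPT2Block"],      # placeholder; use the model's decoder-block class
    dtype=torch.float16,                        # budget against the dtype actually loaded
)
```

The resulting `device_map` dict covers every module and can be passed to `load_checkpoint_and_dispatch` exactly like the hard-coded one above.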
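Separately, `AutoModelForCausalLM.from_pretrained(model_dir, ...)` already materializes the full fp16 weights in host memory before `load_checkpoint_and_dispatch` reads the checkpoint a second time to place it. The usual accelerate pattern avoids the double read by instantiating an empty (meta) model first; a minimal sketch under the same `model_dir`/`offload_dir` assumptions as the commit, again not the code it ships:

```python
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM

model_dir = "/opt/ml/model"       # SageMaker's default model directory
offload_dir = "/tmp/offload_dir"  # same offload location as the commit

config = AutoConfig.from_pretrained(model_dir)
with init_empty_weights():  # no host RAM spent on weights here
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16)

model = load_checkpoint_and_dispatch(
    model,
    model_dir,                  # directory holding the (possibly sharded) checkpoint
    device_map="auto",          # let accelerate place modules, spilling overflow to disk
    offload_folder=offload_dir,
    no_split_module_classes=["GPT2Block"],  # placeholder; keeps each decoder block on one device
)
```

With `device_map="auto"`, accelerate fills the available GPUs and spills whatever remains to `offload_folder`, which makes the explicit 8-way map optional rather than required.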
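The `context=None` additions to `input_fn` and `output_fn` mirror `model_fn(model_dir, context=None)` in the hunk header above: newer versions of the SageMaker inference toolkit invoke the handlers with an extra context argument, and the default keeps both signatures working either way. Both handlers are currently pass-throughs, which is only safe if `predict_fn` parses the raw payload itself. A minimal content-type-aware sketch, assuming JSON payloads (an assumption, not this commit's code):

```python
import json

def input_fn(serialized_input_data, content_type, context=None):
    # Deserialize JSON request bodies; reject anything else explicitly
    if content_type == "application/json":
        return json.loads(serialized_input_data)
    raise ValueError(f"Unsupported content type: {content_type}")

def output_fn(prediction_output, accept, context=None):
    # Serialize the prediction back to JSON for the client
    if accept == "application/json":
        return json.dumps(prediction_output)
    raise ValueError(f"Unsupported accept type: {accept}")
```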