Imran1 committed
Commit 1d61c11
1 Parent(s): 73941c5

Update code/inference.py

Files changed (1):
  1. code/inference.py +9 -25
code/inference.py CHANGED
@@ -1,9 +1,8 @@
-import json
+import json
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing import List, Dict
 from accelerate import load_checkpoint_and_dispatch
-import os
 
 # Global variables to persist the model and tokenizer between invocations
 model = None
@@ -27,33 +26,18 @@ def model_fn(model_dir, context=None):
 
     if model is None:  # Check if the model is already loaded
         print("Loading the model and tokenizer...")
-
         # Define an offload directory for any model components that can't fit in GPU memory
-        offload_dir = "/tmp/offload_dir"
-        os.makedirs(offload_dir, exist_ok=True)  # Ensure the directory exists and SageMaker has write access
-
-        # Explicitly map the model across 8 GPUs
-        device_map = {
-            "transformer.h.0": 0, "transformer.h.1": 0,
-            "transformer.h.2": 1, "transformer.h.3": 1,
-            "transformer.h.4": 2, "transformer.h.5": 2,
-            "transformer.h.6": 3, "transformer.h.7": 3,
-            "transformer.h.8": 4, "transformer.h.9": 4,
-            "transformer.h.10": 5, "transformer.h.11": 5,
-            "transformer.h.12": 6, "transformer.h.13": 6,
-            "transformer.h.14": 7, "transformer.h.15": 7,
-            "transformer.ln_f": 7, "lm_head": 7
-        }
+        offload_dir = "/tmp/offload_dir"  # Ensure SageMaker has write access to this directory
 
-        # Load and dispatch the model across multiple GPUs with offloading
+        # Load and dispatch the model across multiple GPUs
         model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
         model = load_checkpoint_and_dispatch(
-            model,
-            model_dir,
-            device_map=device_map,  # Explicitly map layers across 8 GPUs
+            model,
+            model_dir,
+            device_map="auto",  # Automatically map model layers across devices
             offload_folder=offload_dir,  # Offload parts of the model to disk if GPU memory is insufficient
         )
-
+
         # Load the tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
@@ -121,8 +105,8 @@ def input_fn(serialized_input_data, content_type, context=None):
     return serialized_input_data
 
 # Define output format for SageMaker
-def output_fn(prediction_output, accept, context=None):
+def output_fn(prediction_output, accept, context=None):
     """
     Convert the model output to a JSON response.
     """
-    return json.dumps(prediction_output)
+    return json.dumps(prediction_output)
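
The net effect of the commit: the hand-written 8-GPU device_map (and the os import that supported it) are gone, and layer placement is delegated to Accelerate via device_map="auto", with /tmp/offload_dir as the spill location. Below is a minimal, self-contained sketch of the new loading path, not part of the commit itself; the model_dir value is a placeholder, since in the real handler SageMaker passes model_dir into model_fn and the loaded objects are cached in the module-level globals.

# Sketch of the loading path after this commit (placeholder paths, no SageMaker wrapper).
import torch
from accelerate import load_checkpoint_and_dispatch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "/opt/ml/model"       # placeholder; supplied by SageMaker to model_fn in practice
offload_dir = "/tmp/offload_dir"  # must be writable by the serving container

# Instantiate the model in fp16, then let Accelerate decide where each layer lives
# (GPUs first, then CPU, then the offload folder on disk).
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
model = load_checkpoint_and_dispatch(
    model,
    model_dir,                    # checkpoint location to load weights from
    device_map="auto",            # automatic layer placement across available devices
    offload_folder=offload_dir,   # overflow weights are written here
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

Because "auto" adapts to however many GPUs the endpoint actually exposes, the same handler can run on instance types other than the 8-GPU layout the removed explicit map assumed.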