Imran1
/

Qwen2.5-72B-Instruct-FP8

Model card Files Files and versions Community

Imran1 committed 14 days ago

Commit

544c001

•

1 Parent(s): 13252ff

Update code/inference.py

Files changed (1) hide show

code/inference.py +24 -16

code/inference.py CHANGED Viewed

@@ -2,7 +2,11 @@ import json
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing import List, Dict
-from accelerate import infer_auto_device_map, dispatch_model, load_checkpoint_and_dispatch
 # Function to format chat messages using Qwen's chat template
 def format_chat(messages: List[Dict[str, str]], tokenizer) -> str:
@@ -16,25 +20,29 @@ def model_fn(model_dir, context=None):
     """
     Load the model and tokenizer from the model directory for inference.
     Supports tensor parallelism across multiple GPUs with offloading.
     """
-    # Define an offload directory for any model components that can't fit in GPU memory
-    offload_dir = "/tmp/offload_dir"  # Ensure SageMaker has write access to this directory
-    # Use `Accelerate` to load and dispatch the model across GPUs
-    model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
-    # Distribute the model across multiple GPUs
-    model = load_checkpoint_and_dispatch(
-        model,
-        model_dir,
-        device_map="auto",  # Automatically map model layers across devices
-        offload_folder=offload_dir,  # Offload parts of the model to disk if GPU memory is insufficient
-    )
-    # Load the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_dir)
     return model, tokenizer
 # Custom predict function for SageMaker
 def predict_fn(input_data, model_and_tokenizer):
     """

 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing import List, Dict
+from accelerate import load_checkpoint_and_dispatch
+# Global variables to persist the model and tokenizer between invocations
+model = None
+tokenizer = None
 # Function to format chat messages using Qwen's chat template
 def format_chat(messages: List[Dict[str, str]], tokenizer) -> str:
     """
     Load the model and tokenizer from the model directory for inference.
     Supports tensor parallelism across multiple GPUs with offloading.
+    The model is loaded only once and stored in a global variable.
     """
+    global model, tokenizer  # Declare model and tokenizer as global to persist across invocations
+    if model is None:  # Check if the model is already loaded
+        print("Loading the model and tokenizer...")
+        # Define an offload directory for any model components that can't fit in GPU memory
+        offload_dir = "/tmp/offload_dir"  # Ensure SageMaker has write access to this directory
+        # Load and dispatch the model across multiple GPUs
+        model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
+        model = load_checkpoint_and_dispatch(
+            model,
+            model_dir,
+            device_map="auto",  # Automatically map model layers across devices
+            offload_folder=offload_dir,  # Offload parts of the model to disk if GPU memory is insufficient
+        )
+        # Load the tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(model_dir)
     return model, tokenizer
 # Custom predict function for SageMaker
 def predict_fn(input_data, model_and_tokenizer):
     """