Imran1 committed
Commit f74b603
1 Parent(s): 1d61c11

Update code/inference.py

Files changed (1)
  1. code/inference.py +40 -39
code/inference.py CHANGED
@@ -3,51 +3,50 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing import List, Dict
 from accelerate import load_checkpoint_and_dispatch
-
 # Global variables to persist the model and tokenizer between invocations
 model = None
 tokenizer = None
 
 # Function to format chat messages using Qwen's chat template
 def format_chat(messages: List[Dict[str, str]], tokenizer) -> str:
-    """
-    Format chat messages using Qwen's chat template.
-    """
     return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
-# Model loading function for SageMaker with tensor parallelism and offloading
+# Model loading function for SageMaker with tensor parallelism and FP8 quantization
 def model_fn(model_dir, context=None):
-    """
-    Load the model and tokenizer from the model directory for inference.
-    Supports tensor parallelism across multiple GPUs with offloading.
-    The model is loaded only once and stored in a global variable.
-    """
-    global model, tokenizer  # Declare model and tokenizer as global to persist across invocations
+    global model, tokenizer
 
-    if model is None:  # Check if the model is already loaded
-        print("Loading the model and tokenizer...")
-        # Define an offload directory for any model components that can't fit in GPU memory
-        offload_dir = "/tmp/offload_dir"  # Ensure SageMaker has write access to this directory
+    if model is None:
+        print("Loading the FP8 quantized model and tokenizer...")
+
+        # Define an offload directory
+        offload_dir = "/tmp/offload_dir"
+        os.makedirs(offload_dir, exist_ok=True)
+
+        # Load the tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
-        # Load and dispatch the model across multiple GPUs
-        model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
+        # Load the FP8 quantized model
+        model = AutoModelForCausalLM.from_pretrained(
+            model_dir,
+            torch_dtype=torch.float8,  # Specify FP8 dtype
+            low_cpu_mem_usage=True,
+            device_map="auto",
+            offload_folder=offload_dir,
+        )
+
+        # Use load_checkpoint_and_dispatch for tensor parallelism
         model = load_checkpoint_and_dispatch(
             model,
             model_dir,
-            device_map="auto",  # Automatically map model layers across devices
-            offload_folder=offload_dir,  # Offload parts of the model to disk if GPU memory is insufficient
+            device_map="auto",
+            offload_folder=offload_dir,
+            no_split_module_classes=["QWenLMHeadModel"],  # Adjust if needed for Qwen architecture
         )
-
-        # Load the tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
     return model, tokenizer
 
 # Custom predict function for SageMaker
 def predict_fn(input_data, model_and_tokenizer):
-    """
-    Generate predictions for the input data.
-    """
     try:
         model, tokenizer = model_and_tokenizer
         data = json.loads(input_data)
@@ -57,27 +56,28 @@ def predict_fn(input_data, model_and_tokenizer):
         formatted_prompt = format_chat(messages, tokenizer)
 
         # Tokenize the input
-        inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda:0")  # Send input to GPU 0 for generation
+        inputs = tokenizer([formatted_prompt], return_tensors="pt").to(model.device)
 
         # Generate output
-        outputs = model.generate(
-            inputs['input_ids'],
-            max_new_tokens=data.get("max_new_tokens", 512),
-            temperature=data.get("temperature", 0.7),
-            top_p=data.get("top_p", 0.9),
-            repetition_penalty=data.get("repetition_penalty", 1.0),
-            length_penalty=data.get("length_penalty", 1.0),
-            do_sample=True
-        )
+        with torch.no_grad():
+            outputs = model.generate(
+                inputs['input_ids'],
+                max_new_tokens=data.get("max_new_tokens", 512),
+                temperature=data.get("temperature", 0.7),
+                top_p=data.get("top_p", 0.9),
+                repetition_penalty=data.get("repetition_penalty", 1.0),
+                length_penalty=data.get("length_penalty", 1.0),
+                do_sample=True
+            )
 
         # Decode the output
         generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
 
         # Build response
         response = {
-            "id": "chatcmpl-uuid",
+            "id": "chatcmpl-fp8-quantized",
             "object": "chat.completion",
-            "model": "qwen-72b",
+            "model": "qwen-72b-fp8",
             "choices": [{
                 "index": 0,
                 "message": {
@@ -88,8 +88,8 @@ def predict_fn(input_data, model_and_tokenizer):
             }],
             "usage": {
                 "prompt_tokens": len(inputs['input_ids'][0]),
-                "completion_tokens": len(outputs[0]),
-                "total_tokens": len(inputs['input_ids'][0]) + len(outputs[0])
+                "completion_tokens": len(outputs[0]) - len(inputs['input_ids'][0]),
+                "total_tokens": len(outputs[0])
             }
         }
         return response
@@ -97,6 +97,7 @@ def predict_fn(input_data, model_and_tokenizer):
     except Exception as e:
         return {"error": str(e), "details": repr(e)}
 
+
 # Define input format for SageMaker
 def input_fn(serialized_input_data, content_type, context=None):
     """