Update code/inference.py

code/inference.py  (+24 −8)
```diff
@@ -3,6 +3,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing import List, Dict
 from accelerate import load_checkpoint_and_dispatch
+import os
 
 # Global variables to persist the model and tokenizer between invocations
 model = None
@@ -26,18 +27,33 @@ def model_fn(model_dir, context=None):
 
     if model is None:  # Check if the model is already loaded
         print("Loading the model and tokenizer...")
+
         # Define an offload directory for any model components that can't fit in GPU memory
-        offload_dir = "/tmp/offload_dir"
+        offload_dir = "/tmp/offload_dir"
+        os.makedirs(offload_dir, exist_ok=True)  # Ensure the directory exists and SageMaker has write access
+
+        # Explicitly map the model across 8 GPUs
+        device_map = {
+            "transformer.h.0": 0, "transformer.h.1": 0,
+            "transformer.h.2": 1, "transformer.h.3": 1,
+            "transformer.h.4": 2, "transformer.h.5": 2,
+            "transformer.h.6": 3, "transformer.h.7": 3,
+            "transformer.h.8": 4, "transformer.h.9": 4,
+            "transformer.h.10": 5, "transformer.h.11": 5,
+            "transformer.h.12": 6, "transformer.h.13": 6,
+            "transformer.h.14": 7, "transformer.h.15": 7,
+            "transformer.ln_f": 7, "lm_head": 7
+        }
 
-        # Load and dispatch the model across multiple GPUs
+        # Load and dispatch the model across multiple GPUs with offloading
         model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
         model = load_checkpoint_and_dispatch(
-            model,
-            model_dir,
-            device_map=
+            model,
+            model_dir,
+            device_map=device_map,  # Explicitly map layers across 8 GPUs
            offload_folder=offload_dir,  # Offload parts of the model to disk if GPU memory is insufficient
         )
-
+
         # Load the tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
@@ -98,14 +114,14 @@ def predict_fn(input_data, model_and_tokenizer):
         return {"error": str(e), "details": repr(e)}
 
 # Define input format for SageMaker
-def input_fn(serialized_input_data, content_type):
+def input_fn(serialized_input_data, content_type, context=None):
     """
     Prepare the input data for inference.
     """
     return serialized_input_data
 
 # Define output format for SageMaker
-def output_fn(prediction_output, accept):
+def output_fn(prediction_output, accept, context=None):
     """
     Convert the model output to a JSON response.
     """
```
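Note on the device map: the previous call passed `device_map=` with no value, so `model_fn` could not have run as written, and this commit fixes that by supplying an explicit map. The hard-coded map, however, only names `transformer.h.0` through `transformer.h.15` plus `transformer.ln_f` and `lm_head`; embedding modules and any decoder blocks past `h.15` on a deeper checkpoint are left unassigned, and the `transformer.h.<i>` naming is GPT-2-style, so it must match the actual architecture. A hedged alternative is to derive the map instead of hard-coding it. The sketch below is an illustration under those assumptions, not code from this commit; the `"38GiB"` budget and `"GPT2Block"` class name are placeholders to replace with the real instance limits and block class:

```python
import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

model_dir = "/opt/ml/model"  # SageMaker's default model directory; inside model_fn use its argument

config = AutoConfig.from_pretrained(model_dir)
with init_empty_weights():  # builds the module tree on the meta device, no weights allocated
    meta_model = AutoModelForCausalLM.from_config(config)

device_map = infer_auto_device_map(
    meta_model,
    max_memory={i: "38GiB" for i in range(8)},  # placeholder per-GPU budget; tune to the instance
    no_split_module_classes=["GPT2Block"],      # placeholder; use the model's decoder-block class
    dtype=torch.float16,                        # budget against the dtype actually loaded
)
```

The resulting `device_map` dict covers every module and can be passed to `load_checkpoint_and_dispatch` exactly like the hard-coded one above.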
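Separately, `AutoModelForCausalLM.from_pretrained(model_dir, ...)` already materializes the full fp16 weights in host memory before `load_checkpoint_and_dispatch` reads the checkpoint a second time to place it. The usual accelerate pattern avoids the double read by instantiating an empty (meta) model first; a minimal sketch under the same `model_dir`/`offload_dir` assumptions as the commit, again not the code it ships:

```python
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM

model_dir = "/opt/ml/model"       # SageMaker's default model directory
offload_dir = "/tmp/offload_dir"  # same offload location as the commit

config = AutoConfig.from_pretrained(model_dir)
with init_empty_weights():  # no host RAM spent on weights here
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16)

model = load_checkpoint_and_dispatch(
    model,
    model_dir,                  # directory holding the (possibly sharded) checkpoint
    device_map="auto",          # let accelerate place modules, spilling overflow to disk
    offload_folder=offload_dir,
    no_split_module_classes=["GPT2Block"],  # placeholder; keeps each decoder block on one device
)
```

With `device_map="auto"`, accelerate fills the available GPUs and spills whatever remains to `offload_folder`, which makes the explicit 8-way map optional rather than required.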
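The `context=None` additions to `input_fn` and `output_fn` mirror `model_fn(model_dir, context=None)` in the hunk header above: newer versions of the SageMaker inference toolkit invoke the handlers with an extra context argument, and the default keeps both signatures working either way. Both handlers are currently pass-throughs, which is only safe if `predict_fn` parses the raw payload itself. A minimal content-type-aware sketch, assuming JSON payloads (an assumption, not this commit's code):

```python
import json

def input_fn(serialized_input_data, content_type, context=None):
    # Deserialize JSON request bodies; reject anything else explicitly
    if content_type == "application/json":
        return json.loads(serialized_input_data)
    raise ValueError(f"Unsupported content type: {content_type}")

def output_fn(prediction_output, accept, context=None):
    # Serialize the prediction back to JSON for the client
    if accept == "application/json":
        return json.dumps(prediction_output)
    raise ValueError(f"Unsupported accept type: {accept}")
```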