Imran1 committed on
Commit 73941c5
1 Parent(s): 544c001

Update code/inference.py

Files changed (1)
  1. code/inference.py +24 -8
code/inference.py CHANGED
@@ -3,6 +3,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing import List, Dict
 from accelerate import load_checkpoint_and_dispatch
+import os
 
 # Global variables to persist the model and tokenizer between invocations
 model = None
@@ -26,18 +27,33 @@ def model_fn(model_dir, context=None):
 
     if model is None:  # Check if the model is already loaded
         print("Loading the model and tokenizer...")
+
         # Define an offload directory for any model components that can't fit in GPU memory
-        offload_dir = "/tmp/offload_dir"  # Ensure SageMaker has write access to this directory
+        offload_dir = "/tmp/offload_dir"
+        os.makedirs(offload_dir, exist_ok=True)  # Ensure the directory exists and SageMaker has write access
+
+        # Explicitly map the model across 8 GPUs
+        device_map = {
+            "transformer.h.0": 0, "transformer.h.1": 0,
+            "transformer.h.2": 1, "transformer.h.3": 1,
+            "transformer.h.4": 2, "transformer.h.5": 2,
+            "transformer.h.6": 3, "transformer.h.7": 3,
+            "transformer.h.8": 4, "transformer.h.9": 4,
+            "transformer.h.10": 5, "transformer.h.11": 5,
+            "transformer.h.12": 6, "transformer.h.13": 6,
+            "transformer.h.14": 7, "transformer.h.15": 7,
+            "transformer.ln_f": 7, "lm_head": 7
+        }
 
-        # Load and dispatch the model across multiple GPUs
+        # Load and dispatch the model across multiple GPUs with offloading
         model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
         model = load_checkpoint_and_dispatch(
-            model,
-            model_dir,
-            device_map="auto",  # Automatically map model layers across devices
+            model,
+            model_dir,
+            device_map=device_map,  # Explicitly map layers across 8 GPUs
             offload_folder=offload_dir,  # Offload parts of the model to disk if GPU memory is insufficient
         )
-
+
         # Load the tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
@@ -98,14 +114,14 @@ def predict_fn(input_data, model_and_tokenizer):
         return {"error": str(e), "details": repr(e)}
 
 # Define input format for SageMaker
-def input_fn(serialized_input_data, content_type):
+def input_fn(serialized_input_data, content_type, context=None):
     """
     Prepare the input data for inference.
     """
     return serialized_input_data
 
 # Define output format for SageMaker
-def output_fn(prediction_output, accept):
+def output_fn(prediction_output, accept, context=None):
     """
     Convert the model output to a JSON response.
     """