Imran1 committed
Commit 1d61c11
1 Parent(s): 73941c5

Update code/inference.py

Files changed (1):
  1. code/inference.py +9 -25
code/inference.py CHANGED
@@ -1,9 +1,8 @@
-import json
+import json
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing import List, Dict
 from accelerate import load_checkpoint_and_dispatch
-import os
 
 # Global variables to persist the model and tokenizer between invocations
 model = None
@@ -27,33 +26,18 @@ def model_fn(model_dir, context=None):
 
     if model is None:  # Check if the model is already loaded
         print("Loading the model and tokenizer...")
-
         # Define an offload directory for any model components that can't fit in GPU memory
-        offload_dir = "/tmp/offload_dir"
-        os.makedirs(offload_dir, exist_ok=True)  # Ensure the directory exists and SageMaker has write access
-
-        # Explicitly map the model across 8 GPUs
-        device_map = {
-            "transformer.h.0": 0, "transformer.h.1": 0,
-            "transformer.h.2": 1, "transformer.h.3": 1,
-            "transformer.h.4": 2, "transformer.h.5": 2,
-            "transformer.h.6": 3, "transformer.h.7": 3,
-            "transformer.h.8": 4, "transformer.h.9": 4,
-            "transformer.h.10": 5, "transformer.h.11": 5,
-            "transformer.h.12": 6, "transformer.h.13": 6,
-            "transformer.h.14": 7, "transformer.h.15": 7,
-            "transformer.ln_f": 7, "lm_head": 7
-        }
+        offload_dir = "/tmp/offload_dir"  # Ensure SageMaker has write access to this directory
 
-        # Load and dispatch the model across multiple GPUs with offloading
+        # Load and dispatch the model across multiple GPUs
         model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
         model = load_checkpoint_and_dispatch(
-            model,
-            model_dir,
-            device_map=device_map,  # Explicitly map layers across 8 GPUs
+            model,
+            model_dir,
+            device_map="auto",  # Automatically map model layers across devices
             offload_folder=offload_dir,  # Offload parts of the model to disk if GPU memory is insufficient
         )
-
+
         # Load the tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
@@ -121,8 +105,8 @@ def input_fn(serialized_input_data, content_type, context=None):
     return serialized_input_data
 
 # Define output format for SageMaker
-def output_fn(prediction_output, accept, context=None):
+def output_fn(prediction_output, accept, context=None):
     """
     Convert the model output to a JSON response.
     """
-    return json.dumps(prediction_output)
+    return json.dumps(prediction_output)
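
The net effect of the commit: the hand-written 8-GPU device_map (and the os import that supported it) are gone, and layer placement is delegated to Accelerate via device_map="auto", with /tmp/offload_dir as the spill location. Below is a minimal, self-contained sketch of the new loading path, not part of the commit itself; the model_dir value is a placeholder, since in the real handler SageMaker passes model_dir into model_fn and the loaded objects are cached in the module-level globals.

# Sketch of the loading path after this commit (placeholder paths, no SageMaker wrapper).
import torch
from accelerate import load_checkpoint_and_dispatch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "/opt/ml/model"       # placeholder; supplied by SageMaker to model_fn in practice
offload_dir = "/tmp/offload_dir"  # must be writable by the serving container

# Instantiate the model in fp16, then let Accelerate decide where each layer lives
# (GPUs first, then CPU, then the offload folder on disk).
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16)
model = load_checkpoint_and_dispatch(
    model,
    model_dir,                    # checkpoint location to load weights from
    device_map="auto",            # automatic layer placement across available devices
    offload_folder=offload_dir,   # overflow weights are written here
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

Because "auto" adapts to however many GPUs the endpoint actually exposes, the same handler can run on instance types other than the 8-GPU layout the removed explicit map assumed.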