Marlon Wiprud committed on
Commit
5b8b5f4
·
1 Parent(s): 3740af9

sketch: split on gpus

Browse files
Files changed (1) hide show
  1. handler.py +37 -38
handler.py CHANGED
@@ -4,12 +4,11 @@ from PIL import Image
4
  import requests
5
  from transformers import AutoModelForCausalLM, LlamaTokenizer
6
  import torch
7
-
8
- # from accelerate import (
9
- # init_empty_weights,
10
- # infer_auto_device_map,
11
- # load_checkpoint_and_dispatch,
12
- # )
13
 
14
 
15
  class EndpointHandler:
@@ -26,44 +25,44 @@ class EndpointHandler:
26
 
27
  self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
28
 
29
- self.model = (
30
- AutoModelForCausalLM.from_pretrained(
31
- "THUDM/cogvlm-chat-hf",
32
- torch_dtype=torch.bfloat16,
33
- low_cpu_mem_usage=True,
34
- trust_remote_code=True,
35
- )
36
- .to("cuda")
37
- .eval()
38
- )
39
-
40
- # DISTRIBUTED GPUS
41
- # with init_empty_weights():
42
- # self.model = AutoModelForCausalLM.from_pretrained(
43
  # "THUDM/cogvlm-chat-hf",
44
  # torch_dtype=torch.bfloat16,
45
  # low_cpu_mem_usage=True,
46
  # trust_remote_code=True,
47
  # )
48
-
49
- # device_map = infer_auto_device_map(
50
- # self.model,
51
- # max_memory={
52
- # 0: "16GiB",
53
- # 1: "16GiB",
54
- # 2: "16GiB",
55
- # 3: "16GiB",
56
- # "cpu": "180GiB",
57
- # },
58
- # no_split_module_classes="CogVLMDecoderLayer",
59
- # )
60
- # self.model = load_checkpoint_and_dispatch(
61
- # self.model,
62
- # "/home/ec2-user/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730", # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
63
- # device_map=device_map,
64
- # no_split_module_classes=["CogVLMDecoderLayer"],
65
  # )
66
- # self.model = self.model.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  ## DISTRIBUTED GPUS
68
 
69
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
 
4
  import requests
5
  from transformers import AutoModelForCausalLM, LlamaTokenizer
6
  import torch
7
+ from accelerate import (
8
+ init_empty_weights,
9
+ infer_auto_device_map,
10
+ load_checkpoint_and_dispatch,
11
+ )
 
12
 
13
 
14
  class EndpointHandler:
 
25
 
26
  self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
27
 
28
+ # self.model = (
29
+ # AutoModelForCausalLM.from_pretrained(
 
 
 
 
 
 
 
 
 
 
 
 
30
  # "THUDM/cogvlm-chat-hf",
31
  # torch_dtype=torch.bfloat16,
32
  # low_cpu_mem_usage=True,
33
  # trust_remote_code=True,
34
  # )
35
+ # .to("cuda")
36
+ # .eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  # )
38
+
39
+ # DISTRIBUTED GPUS
40
+ with init_empty_weights():
41
+ self.model = AutoModelForCausalLM.from_pretrained(
42
+ "THUDM/cogvlm-chat-hf",
43
+ torch_dtype=torch.bfloat16,
44
+ low_cpu_mem_usage=True,
45
+ trust_remote_code=True,
46
+ )
47
+
48
+ device_map = infer_auto_device_map(
49
+ self.model,
50
+ max_memory={
51
+ 0: "16GiB",
52
+ 1: "16GiB",
53
+ 2: "16GiB",
54
+ 3: "16GiB",
55
+ "cpu": "180GiB",
56
+ },
57
+ no_split_module_classes=["CogVLMDecoderLayer"],
58
+ )
59
+ self.model = load_checkpoint_and_dispatch(
60
+ self.model,
61
+ "/home/ec2-user/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730", # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
62
+ device_map=device_map,
63
+ no_split_module_classes=["CogVLMDecoderLayer"],
64
+ )
65
+ self.model = self.model.eval()
66
  ## DISTRIBUTED GPUS
67
 
68
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: