kajdun committed
Commit a1c6e67 · 1 Parent(s): 46814d3

Update handler.py

Files changed (1)
  1. handler.py +13 -17
handler.py CHANGED
@@ -4,26 +4,17 @@ from transformers import AutoTokenizer, TextGenerationPipeline
 from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 # check for GPU
-device = 0 if torch.cuda.is_available() else -1
+#device = 0 if torch.cuda.is_available() else -1
 
 #print(f"cuda: {device}")
 
 class EndpointHandler():
-    def __init__(self, path=""):
-        quantize_config = BaseQuantizeConfig(**{
-            "bits": 4,
-            "group_size": 128,
-            "damp_percent": 0.01,
-            "desc_act": False,
-            "static_groups": False,
-            "sym": True,
-            "true_sequential": True
-        })
+    def __init__(self, path=""):
         # load the optimized model
-        model = AutoGPTQForCausalLM.from_quantized(path, device="cuda:0", quantize_config=quantize_config, use_safetensors=True) #file_name="model-quantized.onnx")
-        tokenizer = AutoTokenizer.from_pretrained(path)
+        self.model = AutoGPTQForCausalLM.from_quantized(path, device_map="auto", use_safetensors=True) #file_name="model-quantized.onnx")
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
         # or you can also use pipeline
-        self.generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)
+        #self.generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)
 
     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         """
@@ -36,10 +27,15 @@ class EndpointHandler():
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
 
+        input_ids = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
+
         # pass inputs with all kwargs in data
         if parameters is not None:
-            prediction = self.generator(inputs, **parameters)
+            #prediction = self.generator(inputs, **parameters)
+            outputs = self.model.generate(**input_ids, **parameters)
         else:
-            prediction = self.generator(inputs)
+            outputs = self.model.generate(**input_ids)
 
-        return prediction
+        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        return [{"generated_text": prediction}]
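
With this change the handler tokenizes the request itself and calls model.generate() directly instead of going through TextGenerationPipeline. A minimal local smoke test, as a sketch: it assumes handler.py sits next to the quantized weights in the current directory, and the prompt and generation parameters below are illustrative values, not taken from this commit.

# Sketch: exercise EndpointHandler the way an inference endpoint would.
# Assumes the GPTQ weights and tokenizer files live in the given path.
from handler import EndpointHandler

handler = EndpointHandler(path=".")
payload = {
    "inputs": "Hello, how are you?",  # illustrative prompt
    "parameters": {"max_new_tokens": 64, "do_sample": True, "temperature": 0.7},
}
# Returns [{"generated_text": ...}]; note the decoded text includes the
# prompt, since the full output sequence of generate() is decoded.
print(handler(payload))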