scepter committed on
Commit 40e34b6
1 Parent(s): 2356f5e

Create handler.py

Files changed (1)
  1. handler.py +32 -0
handler.py ADDED
@@ -0,0 +1,32 @@
+ from typing import Dict, List, Any
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # Load the model in 8-bit (requires the bitsandbytes package) and let
+         # device_map="auto" place the layers across the available devices.
+         self.model = AutoModelForCausalLM.from_pretrained(
+             path,
+             device_map="auto",
+             load_in_8bit=True,
+         )
+         self.tokenizer = AutoTokenizer.from_pretrained(path)
+
+     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+         inputs = data.pop("inputs", data)
+         parameters = data.pop("parameters", None)
+
+         # preprocess: tokenize the prompt and move it to the model's device
+         input_ids = self.tokenizer(inputs, return_tensors="pt").input_ids.to(self.model.device)
+
+         # pass any generation kwargs supplied in the request
+         if parameters is not None:
+             outputs = self.model.generate(input_ids, **parameters)
+         else:
+             outputs = self.model.generate(input_ids)
+
+         # postprocess the prediction
+         prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         return [{"generated_text": prediction}]
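
For reference, a minimal local smoke test of this handler might look like the sketch below. The model id and generation parameters are illustrative assumptions, not part of this commit; any causal LM repo that fits in memory should work.

from handler import EndpointHandler

# Hypothetical model repo id, chosen only for illustration.
handler = EndpointHandler(path="openlm-research/open_llama_3b")

# The same JSON shape an Inference Endpoint would POST to the handler.
payload = {"inputs": "Today is", "parameters": {"max_new_tokens": 32}}

print(handler(payload)[0]["generated_text"])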