kajdun committed
Commit 023c0f7 · 1 Parent(s): e56b9b2

Update handler.py

Files changed (1)
  1. handler.py +8 -6
handler.py CHANGED
@@ -1,19 +1,21 @@
 import torch
 from typing import Dict, List, Any
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig, TextGenerationPipeline
+from transformers import pipeline, AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig, TextGenerationPipeline
 from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 # check for GPU
 device = 0 if torch.cuda.is_available() else -1
 
+print(f"cuda: {device}")
+
 class EndpointHandler():
     def __init__(self, path=""):
         # load the optimized model
-        model = AutoGPTQForCausalLM.from_quantized(path, use_safetensors=True) #file_name="model-quantized.onnx")
+        model = AutoGPTQForCausalLM.from_quantized(path, use_safetensors=False, low_cpu_mem_usage=True) #file_name="model-quantized.onnx")
         tokenizer = AutoTokenizer.from_pretrained(path)
         # or you can also use pipeline
-        self.pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
-
+        #self.pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
+        self.generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         """
         Args:
@@ -27,8 +29,8 @@ class EndpointHandler():
 
         # pass inputs with all kwargs in data
         if parameters is not None:
-            prediction = self.pipeline(inputs, **parameters)
+            prediction = self.generator(inputs, **parameters)
         else:
-            prediction = self.pipeline(inputs)
+            prediction = self.generator(inputs)
 
         return prediction
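
The second hunk references `inputs` and `parameters` without showing where they are set; those lines of `__call__` are unchanged and elided from the diff. In the stock Hugging Face custom-handler template they are popped from the request dict, roughly as in this sketch of the likely elided context (not code from this commit):

# Sketch only: how the elided lines of __call__ typically read in the
# standard Hugging Face custom-handler template; the actual file may differ.
inputs = data.pop("inputs", data)
parameters = data.pop("parameters", None)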
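
For reference, a minimal local smoke test of the updated handler, assuming the endpoint's usual {"inputs": ..., "parameters": ...} payload; the model path and generation parameters below are placeholder values.

# Hypothetical local test (not part of the commit). Assumes the GPTQ weight
# files sit in the current directory and that handler.py is importable.
from handler import EndpointHandler

handler = EndpointHandler(path=".")  # directory holding the quantized model
payload = {
    "inputs": "Hello, my name is",
    "parameters": {"max_new_tokens": 32},  # forwarded to the pipeline as **kwargs
}
prediction = handler(payload)
print(prediction)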