Update handler.py
handler.py  CHANGED  (+11 -7)
@@ -7,6 +7,8 @@ import torch
 #device = 0 if torch.cuda.is_available() else -1
 
 MAX_INPUT_TOKEN_LENGTH = 4000
+MAX_MAX_NEW_TOKENS=2048
+DEFAULT_MAX_NEW_TOKENS = 1024
 
 class EndpointHandler():
     def __init__(self, path=""):
@@ -15,23 +17,25 @@ class EndpointHandler():
 
     def get_input_token_length(message: str) -> int:
         input_ids = self.tokenizer([message], return_tensors='np', add_special_tokens=False)['input_ids']
-        return input_ids.shape[-1]
+        return input_ids.shape[-1]
 
     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         inputs = data.pop("inputs", data)
-        parameters = data.pop("parameters",
+        parameters = data.pop("parameters", {})
+
+        parameters["max_new_tokens"] = parameters.pop("max_new_tokens", DEFAULT_MAX_NEW_TOKENS)
+
+        if parameters["max_new_tokens"] > MAX_MAX_NEW_TOKENS:
+            return [{"generated_text": None, "error": f"requested max_new_tokens too high (> {MAX_MAX_NEW_TOKENS})"}]
 
         input_token_length = get_input_token_length(inputs)
         if input_token_length > MAX_INPUT_TOKEN_LENGTH:
-            [{"generated_text": None, "error": f"input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH})"}]
+            return [{"generated_text": None, "error": f"input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH})"}]
 
         #input_ids = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
         input_ids = self.tokenizer(inputs, return_tensors="pt").input_ids
 
-
-            outputs = self.model.generate(**input_ids, **parameters)
-        else:
-            outputs = self.model.generate(**input_ids)
+        outputs = self.model.generate(**input_ids, **parameters)
 
         prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
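The functional change sits in __call__: "parameters" now defaults to an empty dict, "max_new_tokens" falls back to DEFAULT_MAX_NEW_TOKENS and is rejected above MAX_MAX_NEW_TOKENS, and over-long inputs return an error entry instead of falling through. A minimal, self-contained sketch of that validation path follows; the example payload and the placeholder success entry are illustrative assumptions, not part of the committed handler.

# Sketch of the validation flow added in this commit (constants copied from the diff).
# The payload below and the placeholder "generated_text" value are illustrative only.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024

data = {"inputs": "Write a haiku about GPUs.", "parameters": {"max_new_tokens": 4096}}

inputs = data.pop("inputs", data)
parameters = data.pop("parameters", {})

# max_new_tokens falls back to the default; requests above the hard limit are rejected.
parameters["max_new_tokens"] = parameters.pop("max_new_tokens", DEFAULT_MAX_NEW_TOKENS)
if parameters["max_new_tokens"] > MAX_MAX_NEW_TOKENS:
    result = [{"generated_text": None, "error": f"requested max_new_tokens too high (> {MAX_MAX_NEW_TOKENS})"}]
else:
    # The real handler also rejects prompts longer than MAX_INPUT_TOKEN_LENGTH tokens,
    # then runs tokenization, model.generate(), and tokenizer.decode() here.
    result = [{"generated_text": "..."}]

print(result)  # error entry in this example, since 4096 > 2048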