Update handler.py
handler.py  (+4 -4)
@@ -6,9 +6,9 @@ import torch
 # check for GPU
 #device = 0 if torch.cuda.is_available() else -1

-MAX_INPUT_TOKEN_LENGTH
-MAX_MAX_NEW_TOKENS=2048
-DEFAULT_MAX_NEW_TOKENS
+MAX_INPUT_TOKEN_LENGTH = 4000
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024

 class EndpointHandler():
     def __init__(self, path=""):
@@ -28,7 +28,7 @@ class EndpointHandler():
         if parameters["max_new_tokens"] > MAX_MAX_NEW_TOKENS:
             return [{"generated_text": None, "error": f"requested max_new_tokens too high (> {MAX_MAX_NEW_TOKENS})"}]

-        input_token_length = get_input_token_length(inputs)
+        input_token_length = self.get_input_token_length(inputs)
         if input_token_length > MAX_INPUT_TOKEN_LENGTH:
             return [{"generated_text": None, "error": f"input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH})"}]

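For context, the second hunk switches `get_input_token_length` from a free-function call to a method call on the handler. Its implementation is not part of this diff; the sketch below is a minimal, hypothetical version that assumes the handler keeps its tokenizer on `self.tokenizer` (attribute name assumed, not confirmed by this change):

```python
# Hypothetical sketch only: the rest of handler.py is not shown in this diff,
# and the tokenizer attribute name is an assumption.
from transformers import AutoTokenizer

class EndpointHandler():
    def __init__(self, path=""):
        # the real __init__ also loads the model; omitted here
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def get_input_token_length(self, inputs):
        # count prompt tokens so __call__ can reject inputs longer than
        # MAX_INPUT_TOKEN_LENGTH before running generation
        return len(self.tokenizer(inputs)["input_ids"])
```

Calling the helper through `self` keeps it next to the tokenizer the handler already loads, and the first hunk gives the three limits concrete values (4000 input tokens, 2048 max and 1024 default `max_new_tokens`) so the two guard clauses in `__call__` can actually run.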