Update handler.py
handler.py +3 -5
@@ -4,11 +4,6 @@ from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 import torch
 from loguru import logger
 
-# check for GPU
-device = 0 if torch.cuda.is_available() else -1
-
-logger.info(f"cuda: {device}")
-
 MAX_INPUT_TOKEN_LENGTH = 4000
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -29,12 +24,15 @@ class EndpointHandler():
         parameters["max_new_tokens"] = parameters.pop("max_new_tokens", DEFAULT_MAX_NEW_TOKENS)
 
         if parameters["max_new_tokens"] > MAX_MAX_NEW_TOKENS:
+            logger.error(f"requested max_new_tokens too high (> {MAX_MAX_NEW_TOKENS})")
             return [{"generated_text": None, "error": f"requested max_new_tokens too high (> {MAX_MAX_NEW_TOKENS})"}]
 
         input_token_length = self.get_input_token_length(inputs)
         if input_token_length > MAX_INPUT_TOKEN_LENGTH:
+            logger.error(f"input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH})")
             return [{"generated_text": None, "error": f"input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH})"}]
 
+        logger.info(f"inputs: {inputs}")
         input_ids = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
 
         outputs = self.model.generate(**input_ids, **parameters)
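For anyone exercising this change locally, a minimal smoke test is sketched below. It assumes the usual Hugging Face Inference Endpoints custom-handler convention, where EndpointHandler is constructed with a model directory and called with a {"inputs": ..., "parameters": ...} payload; the path and prompts are placeholders, and the actual constructor signature sits outside the hunks above.

# Hypothetical smoke test for the patched handler.py. The model path and
# prompts are placeholders, not values from this commit.
from handler import EndpointHandler

handler = EndpointHandler(".")

# Over the MAX_MAX_NEW_TOKENS cap: the handler now emits a logger.error
# before returning the error payload to the client.
print(handler({"inputs": "Hello", "parameters": {"max_new_tokens": 4096}}))
# [{'generated_text': None, 'error': 'requested max_new_tokens too high (> 2048)'}]

# Within limits: the new logger.info call records the prompt, then generation runs.
print(handler({"inputs": "Hello", "parameters": {"max_new_tokens": 64}}))

The removed module-level device probe should not be missed: input tensors are already routed with .to(self.model.device), so the handler follows whatever device the quantized model was loaded on.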