import logging
import os

import torch
import torch.nn.functional as F
import transformers
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
logger.info("Transformers version %s", transformers.__version__)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, ignoring padding tokens via the attention mask."""
    token_embeddings = model_output[0]  # first element: last_hidden_state, (batch, seq_len, hidden)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
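
# Illustrative only (not part of the handler): with a batch of two sequences
# where the second is padded, mean pooling divides by the count of real
# tokens, not the padded length:
#
#     hidden = torch.ones(2, 4, 8)                       # (batch, seq_len, hidden)
#     mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]])  # 2nd row: 2 real tokens
#     emb = mean_pooling((hidden,), mask)                # shape (2, 8), all ones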
 
class SentenceEmbeddingHandler(BaseHandler):
    def __init__(self):
        super().__init__()
        self._context = None
        self.initialized = False

    def initialize(self, context):
        """
        Initialize function loads the model and the tokenizer

        Args:
            context (context): A JSON object containing information
            pertaining to the model artifact parameters.

        Raises:
            RuntimeError: Raised when the model or tokenizer
            files are missing
        """

        properties = context.system_properties
        self.manifest = context.manifest
        model_dir = properties.get("model_dir")

        # use GPU if available
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() and properties.get("gpu_id") is not None
            else "cpu"
        )
        logger.info(f'Using device {self.device}')

        # load the model
        model_file = self.manifest['model']['modelFile']
        model_path = os.path.join(model_dir, model_file)

        if os.path.isfile(model_path):
            # load the optimized ONNX export (a plain PyTorch alternative would
            # be AutoModel.from_pretrained(model_dir))
            self.model = ORTModelForFeatureExtraction.from_pretrained(model_dir, file_name="model_optimized.onnx")
            self.model.to(self.device)

            logger.info(f'Successfully loaded model from {model_file}')
        else:
            raise RuntimeError(f'Missing model file: {model_path}')
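
        # Note: "model_optimized.onnx" is the default file name ORTOptimizer
        # writes. A sketch of producing it offline (illustrative, run once
        # outside the handler; the model id and save_dir are assumptions, and
        # older optimum versions use from_transformers=True instead of export=True):
        #
        #   from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer
        #   from optimum.onnxruntime.configuration import OptimizationConfig
        #
        #   model = ORTModelForFeatureExtraction.from_pretrained(
        #       "sentence-transformers/all-MiniLM-L6-v2", export=True)
        #   optimizer = ORTOptimizer.from_pretrained(model)
        #   optimizer.optimize(optimization_config=OptimizationConfig(optimization_level=99),
        #                      save_dir="model_dir")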

        # load tokenizer (from_pretrained raises if the tokenizer files are missing)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        logger.info('Successfully loaded tokenizer')

        self.initialized = True

    def preprocess_text(self, inputs):
        # tokenize and move the resulting tensors to the model's device
        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        return encoded_inputs.to(self.device)

    def preprocess(self, requests):
        """
        Tokenize the input text with the model's tokenizer, or convert
        pre-computed encodings to tensors.

        If encodings are provided, the JSON must be of the form
        {'input_ids': [[101, 102]], 'token_type_ids': [[0, 0]], 'attention_mask': [[1, 1]]}

        Args:
            requests: A list containing a dictionary, in the form
            [{'body': json_payload}] or [{'data': json_payload}]; the payload
            holds either an 'input' key (raw text) or an 'encodings' key
            (pre-computed token ids).
        Returns:
            A BatchEncoding containing the batch of token tensors.
        """

        # unpack the data
        data = requests[0].get('body')
        if data is None:
            data = requests[0].get('data')

        texts = data.get('input')
        if texts is not None:
            logger.info('Text provided')
            return self.preprocess_text(texts)

        encodings = data.get('encodings')
        if encodings is not None:
            logger.info('Encodings provided')
            return transformers.BatchEncoding(
                data={k: torch.tensor(v, device=self.device) for k, v in encodings.items()}
            )

        raise ValueError("Unsupported payload: expected an 'input' or 'encodings' key")
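
    # Illustrative request bodies this handler accepts (the token ids below
    # are made up):
    #   {"input": ["I like TorchServe", "ONNX Runtime is fast"]}
    #   {"encodings": {"input_ids": [[101, 2023, 102]],
    #                  "token_type_ids": [[0, 0, 0]],
    #                  "attention_mask": [[1, 1, 1]]}}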

    def inference(self, model_inputs):
        # run the ONNX model, mean-pool the token embeddings, then L2-normalize
        outputs = self.model(**model_inputs)
        sentence_embeddings = mean_pooling(outputs, model_inputs['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings
 
    def postprocess(self, outputs):
        # TorchServe expects one response element per request; this handler
        # processes a single request at a time, so wrap the batch once
        return [{"status": "success", "data": outputs.tolist()}]
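
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the handler). Names such as
# "sentence-embedding", handler.py, and the tokenizer file list are
# assumptions; check your archive layout, since this handler reads the
# manifest's modelFile entry (populated by --model-file):
#
#   torch-model-archiver --model-name sentence-embedding --version 1.0 \
#       --model-file model_optimized.onnx --handler handler.py \
#       --extra-files "tokenizer.json,tokenizer_config.json,special_tokens_map.json" \
#       --export-path model_store
#
#   torchserve --start --model-store model_store --models sentence-embedding.mar
#
#   curl -X POST http://localhost:8080/predictions/sentence-embedding \
#       -H "Content-Type: application/json" \
#       -d '{"input": ["I like TorchServe"]}'
# ---------------------------------------------------------------------------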