# liaad/srl-en_roberta-large_hf

import logging
from typing import Any, Dict, List, Optional, Tuple

import spacy
import torch
from transformers import Pipeline

from decoder import Decoder

logger = logging.getLogger(__name__)


class SrlPipeline(Pipeline):
    """
    A pipeline for Semantic Role Labeling (SRL) using transformers and spaCy models.

    For every input sentence the pipeline finds the verbs with a spaCy POS tagger,
    builds one model input per distinct verb (the tokens of that verb are flagged
    through ``token_type_ids``), runs the model once per verb, and decodes the
    predictions with ``Decoder`` into one label per token.

    Attributes:
        verb_predictor: The spaCy model used to find the verbs of the input sentences.

    Usage:
        # Register the SrlPipeline in the pipeline registry
        PIPELINE_REGISTRY.register_pipeline(
            "srl",
            pipeline_class=SrlPipeline,
            pt_model=SRLModel,  # Assuming SRLModel is the model class used
            type="text",
        )
        # Load the model and tokenizer
        model = AutoModel.from_pretrained("liaad/srl-en_roberta-large_hf", trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained("liaad/srl-en_roberta-large_hf", trust_remote_code=True)
        # Load the SRL pipeline; ``lang`` is handed to ``SrlPipeline.__init__``
        srl_pipeline = pipeline(
            "srl",
            model=model,
            tokenizer=tokenizer,
            lang="en",  # Language specification
        )
        # Example text input
        text = ["The cat jumps over the fence.", "She quickly eats the delicious cake."]
        # Perform semantic role labeling; each sentence yields a list with one
        # ``{verb: (labels, [(token, label), ...])}`` dictionary per verb
        results = srl_pipeline(text)
    """

    def __init__(self, model: Any, tokenizer: Any, framework: str, task: str, **kwargs: Any):
        """
        Initializes the Semantic Role Labeling pipeline.

        Parameters:
        - model: The model, passed on to ``Pipeline.__init__``.
        - tokenizer: The tokenizer, passed on to ``Pipeline.__init__``.
        - framework ``str``: The framework used. Accepted but not used by this class.
        - task ``str``: The specific task of the pipeline. Accepted but not used by this class.
        - **kwargs: Additional keyword arguments.
                - lang ``str``, optional: ``'en'`` loads the English spaCy model; any
                  other value, or no value at all, loads the Portuguese one.
        """
        # NOTE(review): framework, task and the remaining kwargs (e.g. a device
        # argument) are not forwarded to the base class, so the base class falls
        # back to its own defaults for them. Confirm that this is intended.
        super().__init__(model, tokenizer=tokenizer)
        if kwargs.get("lang") == "en":
            logger.info("Loading English verb predictor model...")
            self.verb_predictor = spacy.load("en_core_web_trf")
        else:
            logger.info("Loading Portuguese verb predictor model...")
            self.verb_predictor = spacy.load("pt_core_news_lg")
        logger.info("Got verb prediction model\n")

    def _sanitize_parameters(
        self, **kwargs: Any
    ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
        """
        Sanitizes and organizes additional parameters.

        This pipeline has no call-time options, so every keyword argument is
        ignored and three empty dictionaries are returned.

        Parameters:
        - **kwargs: Additional keyword arguments (ignored).

        Returns:
        - ``Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]``: Three empty
          dictionaries: the parameters for preprocess, _forward, and postprocess.
        """
        return {}, {}, {}

    def preprocess(self, sentence: str) -> List[Dict[str, Any]]:
        """
        Preprocesses a sentence for semantic role labeling.

        Parameters:
        - sentence ``str``: The input sentence to be processed.

        Returns:
        - ``List[Dict[str, Any]]``: One dictionary of model inputs per distinct verb
          of the sentence, in order of first appearance. Each dictionary holds
          ``input_ids``, ``attention_mask`` and ``token_type_ids`` (tensors with a
          leading dimension of 1), plus ``tokens`` (the token strings) and ``verb``.
          The list is empty when the tagger finds neither a verb nor an auxiliary.
        """
        # Extract sentence verbs. dict.fromkeys drops repeated verbs while keeping
        # their order of appearance, so the results come back in a stable order.
        doc = self.verb_predictor(sentence)
        verbs = list(dict.fromkeys(tok.text for tok in doc if tok.pos_ == "VERB"))
        # If the sentence only contains auxiliary verbs, consider those as the
        # main verbs
        if not verbs:
            verbs = list(dict.fromkeys(tok.text for tok in doc if tok.pos_ == "AUX"))

        # Tokenize sentence
        encoding = self.tokenizer.encode_plus(
            sentence,
            truncation=True,
            return_token_type_ids=False,
            return_offsets_mapping=True,
        )
        tokens_lst = encoding.tokens()
        offsets = encoding["offset_mapping"]
        input_ids = torch.tensor([encoding["input_ids"]], dtype=torch.long)
        attention_mask = torch.tensor([encoding["attention_mask"]], dtype=torch.long)

        model_inputs = []
        for verb in verbs:
            # A fresh indicator row is built for every verb. Sharing one list
            # between the per-verb dictionaries would make each verb's tensor
            # also contain the rows of all the verbs processed before it.
            indicator = self._verb_indicator(sentence, offsets, verb)
            model_inputs.append(
                {
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "token_type_ids": torch.tensor([indicator], dtype=torch.long),
                    "tokens": tokens_lst,
                    "verb": verb,
                }
            )
        return model_inputs

    def _verb_indicator(self, sentence: str, offsets: Any, verb: str) -> List[int]:
        """
        Builds the 0/1 indicator marking which tokens belong to ``verb``.

        Parameters:
        - sentence ``str``: The sentence the offsets refer to.
        - offsets: One ``(start, end)`` character span per token.
        - verb ``str``: The verb text to flag.

        Returns:
        - ``List[int]``: One entry per token, 1 when the token is part of a word
          equal to ``verb`` and 0 otherwise.
        """
        indicator: List[int] = []
        curr_word_offsets: Optional[Tuple[int, int]] = None
        for tok_start, tok_end in offsets:
            if not sentence[tok_start:tok_end]:
                # Tokens that cover no text of the sentence are never part of a verb
                indicator.append(0)
            elif (
                curr_word_offsets is not None
                and tok_start >= curr_word_offsets[0]
                and tok_end <= curr_word_offsets[1]
            ):
                # Token still belongs to the current word: extend previous token type
                indicator.append(indicator[-1])
            else:
                # Token starts a new word: flag it when that word is the verb
                curr_word_offsets = self._find_word(sentence, start=tok_start)
                curr_word = sentence[curr_word_offsets[0] : curr_word_offsets[1]]
                indicator.append(int(curr_word != "" and curr_word == verb))
        return indicator

    def _forward(self, model_inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Internal method to forward model inputs for prediction.

        The model is called once per entry, i.e. once per verb.

        Parameters:
        - model_inputs ``List[Dict[str, Any]]``: List of dictionaries containing model inputs.

        Returns:
        - ``List[Dict[str, Any]]``: The model outputs, each one extended with the
          ``verb`` and ``tokens`` entries of its input.
        """
        outputs = []
        for model_input in model_inputs:
            output = self.model(
                input_ids=model_input["input_ids"],
                attention_mask=model_input["attention_mask"],
                token_type_ids=model_input["token_type_ids"],
            )
            # Carry the verb and the token strings along for postprocess
            output["verb"] = model_input["verb"]
            output["tokens"] = model_input["tokens"]
            outputs.append(output)
        return outputs

    def postprocess(self, model_outputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Postprocesses model outputs to human-readable format.

        Parameters:
        - model_outputs ``List[Dict[str, Any]]``: List of dictionaries containing model
          outputs. Each one must provide ``class_probabilities``, ``attention_mask``,
          ``verb`` and ``tokens``.

        Returns:
        - ``List[Dict[str, Any]]``: List of dictionaries containing processed results.
                Each dictionary entry represents a verb with its associated labels and token-label pairs.
                Example format: {verb: (labels, List[(token, label)])}
        """
        result = []
        id2label = {int(k): str(v) for k, v in self.model.config.id2label.items()}
        evaluator = Decoder(id2label)
        for model_output in model_outputs:
            output_dict = evaluator.make_output_human_readable(
                model_output["class_probabilities"], model_output["attention_mask"]
            )
            # Here we always fetch the first list because in a pipeline every
            # sentence is processed one at a time
            wordpiece_label_ids = output_dict["wordpiece_label_ids"][0]
            labels = [id2label[idx] for idx in wordpiece_label_ids]
            result.append(
                {
                    model_output["verb"]: (
                        labels,
                        list(zip(model_output["tokens"], labels)),
                    )
                }
            )
        return result

    def _find_word(self, s: str, start: int = 0) -> Tuple[int, int]:
        """
        Helper method to find the boundaries of a word in a string.

        Assumes a non alphabetic char (``str.isalpha`` is false, which includes
        digits) represents the end of a word.

        Parameters:
        - s ``str``: The input string.
        - start ``int``, optional: Starting index to start looking for the word. Defaults to 0.

        Returns:
        - ``Tuple[int, int]``: Start and end indices of the word. The end index is
          exclusive; both are equal when ``s[start]`` is not alphabetic.
        """
        for i, char in enumerate(s[start:], start):
            if not char.isalpha():
                return start, i
        return start, len(s)