import logging from typing import Any, Dict, List, Tuple import spacy import torch from transformers import Pipeline from decoder import Decoder logger = logging.getLogger(__name__) class SrlPipeline(Pipeline): """ A pipeline for Semantic Role Labeling (SRL) using transformers and spaCy models. This pipeline tokenizes input sentences, finds verbs using POS tagging, and postprocesses the model outputs using Viterbi decoding to provide human-readable results. Attributes: model ``str``: The name or identifier of the underlying transformer model. tokenizer ``str``: The name or identifier of the tokenizer associated with the model. framework ``str``: The framework used for the pipeline (e.g., PyTorch, TensorFlow). task ``str``: The specific task of the pipeline. verb_predictor: An instance of spaCy model used for predicting verbs in the input sentences. Usage: # Register the SrlPipeline in the pipeline registry PIPELINE_REGISTRY.register_pipeline( "srl", pipeline_class=SrlPipeline, model=SRLModel, # Assuming SRLModel is the model class used default={"lang": "en"}, type="text", ) # Load the model and tokenizer model = AutoModel.from_pretrained("liaad/srl-en_roberta-large_hf", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained("liaad/srl-en_roberta-large_hf", trust_remote_code=True) # Load the SRL pipeline srl_pipeline = pipeline( "srl", model=model, tokenizer=tokenizer, framework="PyTorch", # Replace with actual framework used task="semantic_role_labeling", # Replace with actual task name lang="en" # Language specification ) # Example text input text = ["The cat jumps over the fence.", "She quickly eats the delicious cake."] # Perform semantic role labeling results = srl_pipeline(text) """ def __init__(self, model: str, tokenizer: str, framework: str, task: str, **kwargs): """ Initializes the Semantic Role Labeling pipeline. Parameters: - model ``str``: The model name or identifier. - tokenizer ``str``: The tokenizer name or identifier. - framework ``str``: The framework used. - task ``str``: The specific task of the pipeline. - **kwargs: Additional keyword arguments. - lang ``str``, optional: Language specification ('en' for English or 'pt' for Portuguese, which is default). """ super().__init__(model, tokenizer=tokenizer) if "lang" in kwargs and kwargs["lang"] == "en": logger.info("Loading English verb predictor model...") self.verb_predictor = spacy.load("en_core_web_trf") else: logger.info("Loading Portuguese verb predictor model...") self.verb_predictor = spacy.load("pt_core_news_lg") logger.info("Got verb prediction model\n") def _sanitize_parameters( self, **kwargs: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]: """ Sanitizes and organizes additional parameters. Parameters: - **kwargs: Additional keyword arguments. Returns: - ``Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]``: Three dictionaries of sanitized parameters for preprocess, _forward, and postprocess. """ return {}, {}, {} def preprocess(self, sentence: str) -> List[Dict[str, Any]]: """ Preprocesses a sentence for semantic role labeling. Parameters: - sentence ``str``: The input sentence to be processed. Returns: - ``List[Dict[str, Any]]``: A list of dictionaries containing model inputs for each verb in the sentence. """ # Extract sentence verbs doc = self.verb_predictor(sentence) verbs = {token.text for token in doc if token.pos_ == "VERB"} # If the sentence only contains auxiliary verbs, consider those as the # main verbs if not verbs: verbs = {token.text for token in doc if token.pos_ == "AUX"} # Tokenize sentence tokens = self.tokenizer.encode_plus( sentence, truncation=True, return_token_type_ids=False, return_offsets_mapping=True, ) tokens_lst = tokens.tokens() offsets = tokens["offset_mapping"] input_ids = torch.tensor([tokens["input_ids"]], dtype=torch.long) attention_mask = torch.tensor([tokens["attention_mask"]], dtype=torch.long) model_input = { "input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": [], "tokens": tokens_lst, "verb": "", } model_inputs = [ {**model_input} for _ in verbs ] # Create a new dictionary for each verb for i, verb in enumerate(verbs): model_inputs[i]["verb"] = verb token_type_ids = model_inputs[i]["token_type_ids"] token_type_ids.append([]) curr_word_offsets: tuple[int, int] = None for j in range(len(tokens_lst)): curr_offsets = offsets[j] curr_slice = sentence[curr_offsets[0] : curr_offsets[1]] if not curr_slice: token_type_ids[-1].append(0) # Check if new token still belongs to same word elif ( curr_word_offsets and curr_offsets[0] >= curr_word_offsets[0] and curr_offsets[1] <= curr_word_offsets[1] ): # Extend previous token type token_type_ids[-1].append(token_type_ids[-1][-1]) else: curr_word_offsets = self._find_word(sentence, start=curr_offsets[0]) curr_word = sentence[curr_word_offsets[0] : curr_word_offsets[1]] token_type_ids[-1].append( int(curr_word != "" and curr_word == verb) ) model_inputs[i]["token_type_ids"] = torch.tensor( token_type_ids, dtype=torch.long ) return model_inputs def _forward(self, model_inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Internal method to forward model inputs for prediction. Parameters: - model_inputs ``List[Dict[str, Any]]``: List of dictionaries containing model inputs. Returns: - ``List[Dict[str, Any]]``: List of dictionaries containing model outputs. """ outputs = [] for model_input in model_inputs: output = self.model( input_ids=model_input["input_ids"], attention_mask=model_input["attention_mask"], token_type_ids=model_input["token_type_ids"], ) output["verb"] = model_input["verb"] output["tokens"] = model_input["tokens"] outputs.append(output) return outputs def postprocess(self, model_outputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Postprocesses model outputs to human-readable format. Parameters: - model_outputs ``List[Dict[str, Any]]``: List of dictionaries containing model outputs. Returns: - ``List[Dict[str, Any]]``: List of dictionaries containing processed results. Each dictionary entry represents a verb with its associated labels and token-label pairs. Example format: {verb: (labels, List[(token, label)])} """ result = [] id2label = {int(k): str(v) for k, v in self.model.config.id2label.items()} evaluator = Decoder(id2label) for model_output in model_outputs: class_probabilities = model_output["class_probabilities"] attention_mask = model_output["attention_mask"] output_dict = evaluator.make_output_human_readable( class_probabilities, attention_mask ) # Here we always fetch the first list because in a pipeline every # sentence is processed one at a time wordpiece_label_ids = output_dict["wordpiece_label_ids"][0] labels = list(map(lambda idx: id2label[idx], wordpiece_label_ids)) result.append( { model_output["verb"]: ( labels, list(zip(model_output["tokens"], labels)), ) } ) return result def _find_word(self, s: str, start: int = 0) -> Tuple[int, int]: """ Helper method to find the boundaries of a word in a string. Assumes a non alphanumeric char represents the end of a word. Parameters: - s ``str``: The input string. - start ``int``, optional: Starting index to start looking for the word. Defaults to 0. Returns: - ``Tuple[int, int]``: Start and end indices of the word. """ for i, char in enumerate(s[start:], start): if not char.isalpha(): return start, i return start, len(s)