import logging |
from typing import Any, Dict, List, Tuple |
import spacy |
import torch |
from transformers import Pipeline |
from decoder import Decoder |
logger = logging.getLogger(__name__) |
class SrlPipeline(Pipeline):
    """
    A pipeline for Semantic Role Labeling (SRL) using transformers and spaCy models.

    This pipeline tokenizes input sentences, finds verbs using POS tagging, builds one model
    input per distinct verb (the verb's wordpieces are flagged through ``token_type_ids``),
    and postprocesses the model outputs with ``Decoder`` to provide human-readable results.

    Attributes:
        model: The underlying transformer model, handed to the base ``Pipeline``.
        tokenizer: The tokenizer associated with the model. ``preprocess`` asks it for an
            offset mapping (presumably this requires a fast tokenizer - TODO confirm).
        verb_predictor: An instance of spaCy model used for predicting verbs in the input
            sentences.

    Usage:
        # Register the SrlPipeline in the pipeline registry
        PIPELINE_REGISTRY.register_pipeline(
            "srl",
            pipeline_class=SrlPipeline,
            pt_model=SRLModel,  # Assuming SRLModel is the model class used
            default={"lang": "en"},
            type="text",
        )

        # Load the model and tokenizer
        model = AutoModel.from_pretrained("liaad/srl-en_roberta-large_hf", trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained("liaad/srl-en_roberta-large_hf", trust_remote_code=True)

        # Load the SRL pipeline (the task name is the first positional argument;
        # do not pass ``task=`` a second time)
        srl_pipeline = pipeline(
            "srl",
            model=model,
            tokenizer=tokenizer,
            lang="en",  # Language specification
        )

        # Example text input
        text = ["The cat jumps over the fence.", "She quickly eats the delicious cake."]

        # Perform semantic role labeling
        results = srl_pipeline(text)
    """

    def __init__(self, model: Any, tokenizer: Any, framework: str, task: str, **kwargs: Any):
        """
        Initializes the Semantic Role Labeling pipeline.

        Parameters:
        - model: The model to run (passed on to the base ``Pipeline``).
        - tokenizer: The tokenizer to use (passed on to the base ``Pipeline``).
        - framework ``str``: The framework used. Accepted for interface compatibility only.
        - task ``str``: The specific task of the pipeline. Accepted for interface
          compatibility only.
        - **kwargs: Additional keyword arguments.
            - lang ``str``, optional: Language specification ('en' for English; any other
              value, or no value, selects Portuguese, which is the default).
        """
        # NOTE(review): only ``model`` and ``tokenizer`` reach the base class; ``framework``,
        # ``task`` and every other keyword argument are not forwarded. Kept as-is so that
        # existing callers behave exactly as before.
        super().__init__(model, tokenizer=tokenizer)

        if kwargs.get("lang") == "en":
            logger.info("Loading English verb predictor model...")
            self.verb_predictor = spacy.load("en_core_web_trf")
        else:
            logger.info("Loading Portuguese verb predictor model...")
            self.verb_predictor = spacy.load("pt_core_news_lg")
        logger.info("Got verb prediction model\n")

    def _sanitize_parameters(
        self, **kwargs: Any
    ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
        """
        Sanitizes and organizes additional parameters.

        No call-time parameters are supported: whatever is passed in is ignored.

        Parameters:
        - **kwargs: Additional keyword arguments (ignored).

        Returns:
        - ``Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]``: Three empty dictionaries,
          for preprocess, _forward, and postprocess respectively.
        """
        return {}, {}, {}

    def preprocess(self, sentence: str) -> List[Dict[str, Any]]:
        """
        Preprocesses a sentence for semantic role labeling.

        Verbs are the spaCy tokens tagged ``VERB``; when there are none, tokens tagged
        ``AUX`` are used instead. Verbs are identified by their surface text, so repeated
        occurrences of the same text yield a single model input in which every occurrence
        is flagged.

        Parameters:
        - sentence ``str``: The input sentence to be processed.

        Returns:
        - ``List[Dict[str, Any]]``: One dictionary per distinct verb, in order of first
          appearance, with keys ``input_ids``, ``attention_mask``, ``token_type_ids``
          (each a ``(1, sequence_length)`` long tensor), ``tokens`` and ``verb``.
          The list is empty when no verb is found.
        """
        doc = self.verb_predictor(sentence)
        # dict.fromkeys de-duplicates while keeping sentence order, so the result order is
        # deterministic (a set would be subject to string-hash randomization).
        verbs = list(dict.fromkeys(tok.text for tok in doc if tok.pos_ == "VERB"))
        if not verbs:
            verbs = list(dict.fromkeys(tok.text for tok in doc if tok.pos_ == "AUX"))

        encoding = self.tokenizer.encode_plus(
            sentence,
            truncation=True,
            return_token_type_ids=False,
            return_offsets_mapping=True,
        )
        tokens_lst = encoding.tokens()
        offsets = encoding["offset_mapping"]
        input_ids = torch.tensor([encoding["input_ids"]], dtype=torch.long)
        attention_mask = torch.tensor([encoding["attention_mask"]], dtype=torch.long)

        model_inputs = []
        for verb in verbs:
            # A fresh single-row list per verb: sharing one list between verbs would make
            # later verbs accumulate the rows of earlier ones and no longer match the
            # (1, sequence_length) shape of input_ids.
            verb_flags = self._verb_flags(sentence, offsets, verb)
            model_inputs.append(
                {
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "token_type_ids": torch.tensor([verb_flags], dtype=torch.long),
                    "tokens": tokens_lst,
                    "verb": verb,
                }
            )
        return model_inputs

    def _verb_flags(self, sentence: str, offsets: Any, verb: str) -> List[int]:
        """
        Helper method that flags the wordpieces belonging to a given verb.

        Parameters:
        - sentence ``str``: The sentence the offsets refer to.
        - offsets: Sequence of ``(start, end)`` character offsets, one per wordpiece.
        - verb ``str``: Surface text of the verb to flag.

        Returns:
        - ``List[int]``: One entry per wordpiece; 1 when the wordpiece is part of a word
          equal to ``verb``, otherwise 0.
        """
        flags: List[int] = []
        word_span = None  # (start, end) of the word the previous wordpieces belong to
        for start, end in offsets:
            if not sentence[start:end]:
                # Wordpieces covering no text (e.g. special tokens) are never verbs.
                flags.append(0)
            elif word_span and word_span[0] <= start and end <= word_span[1]:
                # Still inside the current word: inherit the flag of its first wordpiece.
                flags.append(flags[-1])
            else:
                word_span = self._find_word(sentence, start=start)
                word = sentence[word_span[0] : word_span[1]]
                flags.append(int(word != "" and word == verb))
        return flags

    def _forward(self, model_inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Internal method to forward model inputs for prediction.

        The model is called once per verb; the verb and the wordpiece tokens are attached
        to each model output so that ``postprocess`` can pair labels with tokens.

        Parameters:
        - model_inputs ``List[Dict[str, Any]]``: List of dictionaries containing model inputs,
          as produced by ``preprocess``.

        Returns:
        - ``List[Dict[str, Any]]``: List of model outputs, each extended with the ``verb``
          and ``tokens`` entries of its input.
        """
        outputs = []
        for model_input in model_inputs:
            output = self.model(
                input_ids=model_input["input_ids"],
                attention_mask=model_input["attention_mask"],
                token_type_ids=model_input["token_type_ids"],
            )
            output["verb"] = model_input["verb"]
            output["tokens"] = model_input["tokens"]
            outputs.append(output)
        return outputs

    def postprocess(self, model_outputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Postprocesses model outputs to human-readable format.

        Parameters:
        - model_outputs ``List[Dict[str, Any]]``: List of dictionaries containing model outputs.
          Each one must provide ``class_probabilities``, ``attention_mask``, ``verb`` and
          ``tokens``.

        Returns:
        - ``List[Dict[str, Any]]``: List of dictionaries containing processed results.
          Each dictionary entry represents a verb with its associated labels and token-label pairs.
          Example format: {verb: (labels, List[(token, label)])}
        """
        id2label = {int(k): str(v) for k, v in self.model.config.id2label.items()}
        evaluator = Decoder(id2label)

        result = []
        for model_output in model_outputs:
            output_dict = evaluator.make_output_human_readable(
                model_output["class_probabilities"], model_output["attention_mask"]
            )
            # Each model call carries a single sentence, so only the first entry is read.
            wordpiece_label_ids = output_dict["wordpiece_label_ids"][0]
            labels = [id2label[idx] for idx in wordpiece_label_ids]
            result.append(
                {
                    model_output["verb"]: (
                        labels,
                        list(zip(model_output["tokens"], labels)),
                    )
                }
            )
        return result

    def _find_word(self, s: str, start: int = 0) -> Tuple[int, int]:
        """
        Helper method to find the boundaries of a word in a string.

        Assumes the first non-alphabetic char (per ``str.isalpha``) at or after ``start``
        represents the end of the word, so digits, hyphens and apostrophes end a word too.

        Parameters:
        - s ``str``: The input string.
        - start ``int``, optional: Starting index to start looking for the word. Defaults to 0.

        Returns:
        - ``Tuple[int, int]``: Start and (exclusive) end indices of the word. The span is
          empty when the char at ``start`` is not alphabetic.
        """
        for i in range(start, len(s)):
            if not s[i].isalpha():
                return start, i
        return start, len(s)