|
import logging |
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
import spacy |
|
import torch |
|
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizerBase
|
|
|
from decoder import Decoder |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
class SrlPipeline(Pipeline): |
|
""" |
|
A pipeline for Semantic Role Labeling (SRL) using transformers and spaCy models. |
|
|
|
This pipeline tokenizes input sentences, finds verbs using POS tagging, and postprocesses |
|
the model outputs using Viterbi decoding to provide human-readable results. |
|
|
|
Attributes: |
|
        model: The underlying transformer model used for SRL predictions.
        tokenizer: The tokenizer associated with the model.
        framework ``str``: The framework used for the pipeline ("pt" for PyTorch, "tf" for TensorFlow).
        task ``str``: The specific task of the pipeline.
        verb_predictor: The spaCy model used for predicting verbs in the input sentences.
|
Usage: |
|
# Register the SrlPipeline in the pipeline registry |
|
PIPELINE_REGISTRY.register_pipeline( |
|
"srl", |
|
pipeline_class=SrlPipeline, |
|
            pt_model=SRLModel,  # Assuming SRLModel is the model class used
|
default={"lang": "en"}, |
|
type="text", |
|
) |
|
|
|
# Load the model and tokenizer |
|
model = AutoModel.from_pretrained("liaad/srl-en_roberta-large_hf", trust_remote_code=True) |
|
tokenizer = AutoTokenizer.from_pretrained("liaad/srl-en_roberta-large_hf", trust_remote_code=True) |
|
|
|
# Load the SRL pipeline |
|
srl_pipeline = pipeline( |
|
"srl", |
|
model=model, |
|
tokenizer=tokenizer, |
|
framework="PyTorch", # Replace with actual framework used |
|
task="semantic_role_labeling", # Replace with actual task name |
|
lang="en" # Language specification |
|
) |
|
|
|
# Example text input |
|
text = ["The cat jumps over the fence.", "She quickly eats the delicious cake."] |
|
|
|
# Perform semantic role labeling |
|
results = srl_pipeline(text) |
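
        # Each input sentence yields one entry per detected verb; illustrative shape only:
        # [{"jumps": (labels, [("The", "O"), ("cat", "B-ARG0"), ("jumps", "B-V"), ...])}]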
|
""" |
|
|
|
    def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, framework: str, task: str, **kwargs):
|
""" |
|
Initializes the Semantic Role Labeling pipeline. |
|
|
|
Parameters: |
|
        - model ``PreTrainedModel``: The loaded transformer model.
        - tokenizer ``PreTrainedTokenizerBase``: The tokenizer associated with the model.
        - framework ``str``: The framework used ("pt" for PyTorch, "tf" for TensorFlow).
        - task ``str``: The specific task of the pipeline.
        - **kwargs: Additional keyword arguments.
            - lang ``str``, optional: Language of the verb predictor: 'en' for English; any other value (or omitting it) selects Portuguese, the default.
|
""" |
|
super().__init__(model, tokenizer=tokenizer) |
|
if "lang" in kwargs and kwargs["lang"] == "en": |
|
logger.info("Loading English verb predictor model...") |
|
self.verb_predictor = spacy.load("en_core_web_trf") |
|
else: |
|
logger.info("Loading Portuguese verb predictor model...") |
|
self.verb_predictor = spacy.load("pt_core_news_lg") |
|
logger.info("Got verb prediction model\n") |
|
|
|
def _sanitize_parameters( |
|
        self, **kwargs: Any
|
) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]: |
|
""" |
|
Sanitizes and organizes additional parameters. |
|
|
|
Parameters: |
|
- **kwargs: Additional keyword arguments. |
|
|
|
Returns: |
|
- ``Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]``: Three dictionaries of sanitized parameters for preprocess, _forward, and postprocess. |
|
""" |
|
return {}, {}, {} |
|
|
|
def preprocess(self, sentence: str) -> List[Dict[str, Any]]: |
|
""" |
|
Preprocesses a sentence for semantic role labeling. |
|
|
|
Parameters: |
|
- sentence ``str``: The input sentence to be processed. |
|
|
|
Returns: |
|
- ``List[Dict[str, Any]]``: A list of dictionaries containing model inputs for each verb in the sentence. |
|
""" |
|
|
|
doc = self.verb_predictor(sentence) |
|
|
|
verbs = {token.text for token in doc if token.pos_ == "VERB"} |
|
|
|
|
|
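        # If spaCy found no main verb (e.g., a purely copular sentence such as
        # "She is happy."), fall back to auxiliary verbs as predicate candidates.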
if not verbs: |
|
verbs = {token.text for token in doc if token.pos_ == "AUX"} |
|
|
|
|
|
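        # Encode the sentence once; the offset mapping lets us align each wordpiece
        # back to its character span in the original sentence.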
tokens = self.tokenizer.encode_plus( |
|
sentence, |
|
truncation=True, |
|
return_token_type_ids=False, |
|
return_offsets_mapping=True, |
|
) |
|
tokens_lst = tokens.tokens() |
|
offsets = tokens["offset_mapping"] |
|
|
|
input_ids = torch.tensor([tokens["input_ids"]], dtype=torch.long) |
|
attention_mask = torch.tensor([tokens["attention_mask"]], dtype=torch.long) |
|
|
|
model_input = { |
|
"input_ids": input_ids, |
|
"attention_mask": attention_mask, |
|
"token_type_ids": [], |
|
"tokens": tokens_lst, |
|
"verb": "", |
|
} |
|
|
|
        # One input per verb. Each copy gets its own token_type_ids list; a plain
        # shallow copy would make every verb share (and append to) the same list.
        model_inputs = [
            {**model_input, "token_type_ids": []} for _ in verbs
        ]
|
|
|
for i, verb in enumerate(verbs): |
|
model_inputs[i]["verb"] = verb |
|
token_type_ids = model_inputs[i]["token_type_ids"] |
|
token_type_ids.append([]) |
|
            curr_word_offsets: Optional[Tuple[int, int]] = None
|
|
|
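            # Build the per-wordpiece verb indicator: special tokens get 0, pieces that
            # continue the current word repeat the previous value, and each new word
            # gets 1 only if it exactly matches the target verb.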
for j in range(len(tokens_lst)): |
|
curr_offsets = offsets[j] |
|
curr_slice = sentence[curr_offsets[0] : curr_offsets[1]] |
|
if not curr_slice: |
|
token_type_ids[-1].append(0) |
|
|
|
elif ( |
|
curr_word_offsets |
|
and curr_offsets[0] >= curr_word_offsets[0] |
|
and curr_offsets[1] <= curr_word_offsets[1] |
|
): |
|
|
|
token_type_ids[-1].append(token_type_ids[-1][-1]) |
|
else: |
|
curr_word_offsets = self._find_word(sentence, start=curr_offsets[0]) |
|
curr_word = sentence[curr_word_offsets[0] : curr_word_offsets[1]] |
|
|
|
token_type_ids[-1].append( |
|
int(curr_word != "" and curr_word == verb) |
|
) |
|
|
|
model_inputs[i]["token_type_ids"] = torch.tensor( |
|
token_type_ids, dtype=torch.long |
|
) |
|
|
|
return model_inputs |
|
|
|
def _forward(self, model_inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: |
|
""" |
|
Internal method to forward model inputs for prediction. |
|
|
|
Parameters: |
|
- model_inputs ``List[Dict[str, Any]]``: List of dictionaries containing model inputs. |
|
|
|
Returns: |
|
- ``List[Dict[str, Any]]``: List of dictionaries containing model outputs. |
|
""" |
|
outputs = [] |
|
for model_input in model_inputs: |
|
output = self.model( |
|
input_ids=model_input["input_ids"], |
|
attention_mask=model_input["attention_mask"], |
|
token_type_ids=model_input["token_type_ids"], |
|
) |
|
output["verb"] = model_input["verb"] |
|
output["tokens"] = model_input["tokens"] |
|
outputs.append(output) |
|
return outputs |
|
|
|
def postprocess(self, model_outputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: |
|
""" |
|
Postprocesses model outputs to human-readable format. |
|
|
|
Parameters: |
|
- model_outputs ``List[Dict[str, Any]]``: List of dictionaries containing model outputs. |
|
|
|
Returns: |
|
- ``List[Dict[str, Any]]``: List of dictionaries containing processed results. |
|
Each dictionary entry represents a verb with its associated labels and token-label pairs. |
|
Example format: {verb: (labels, List[(token, label)])} |
|
""" |
|
result = [] |
|
id2label = {int(k): str(v) for k, v in self.model.config.id2label.items()} |
|
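        # Viterbi-decode the per-wordpiece class probabilities into a coherent label sequence.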
evaluator = Decoder(id2label) |
|
|
|
for model_output in model_outputs: |
|
class_probabilities = model_output["class_probabilities"] |
|
attention_mask = model_output["attention_mask"] |
|
output_dict = evaluator.make_output_human_readable( |
|
class_probabilities, attention_mask |
|
) |
|
|
|
|
|
wordpiece_label_ids = output_dict["wordpiece_label_ids"][0] |
|
labels = list(map(lambda idx: id2label[idx], wordpiece_label_ids)) |
|
result.append( |
|
{ |
|
model_output["verb"]: ( |
|
labels, |
|
list(zip(model_output["tokens"], labels)), |
|
) |
|
} |
|
) |
|
return result |
|
|
|
def _find_word(self, s: str, start: int = 0) -> Tuple[int, int]: |
|
""" |
|
Helper method to find the boundaries of a word in a string. |
|
        Assumes a non-alphabetic character marks the end of a word.
|
|
|
Parameters: |
|
- s ``str``: The input string. |
|
- start ``int``, optional: Starting index to start looking for the word. Defaults to 0. |
|
|
|
Returns: |
|
- ``Tuple[int, int]``: Start and end indices of the word. |
|
""" |
|
for i, char in enumerate(s[start:], start): |
|
if not char.isalpha(): |
|
return start, i |
|
return start, len(s) |
|
|