File size: 6,380 Bytes
1068f10 bd054e7 38b9330 bd054e7 38b9330 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
---
license: cc-by-4.0
datasets:
- AnnaWegmann/Dialog-Paraphrase
language:
- en
base_model: microsoft/deberta-v3-large
---
This model was created as described in https://arxiv.org/abs/2404.06670; it is the best `DeBERTa AGGREGATED` model. See also the [GitHub](https://github.com/nlpsoc/Paraphrases-in-News-Interviews) repository.
```python
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
class ParaphraseHighlighter:
    """Print which words of two utterances a token classifier marks as paraphrased.

    The tokenizer and token-classification model are loaded once in
    ``__init__``. ``highlight_paraphrase`` then scores a pair of texts word by
    word, using the probability of the label named ``'LABEL_1'``.
    """

    # A word is highlighted when its LABEL_1 probability reaches this value.
    _THRESHOLD = 0.5

    def __init__(self, model_name="AnnaWegmann/Highlight-Paraphrases-in-Dialog"):
        """Load the fast tokenizer and the token-classification model.

        Args:
            model_name: Identifier passed to both ``from_pretrained`` calls.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        # Index of 'LABEL_1' along the last (label) axis of the logits.
        self.label2id = self.model.config.label2id
        self.label_id = self.label2id['LABEL_1']

    def highlight_paraphrase(self, text1, text2):
        """Score ``text1``/``text2`` and print the highlighted words.

        Prints ``"is a paraphrase"`` when both texts contain at least one word
        whose LABEL_1 probability is >= 0.5 (otherwise ``"is not a
        paraphrase"``), followed by each speaker's words, upper-cased when
        highlighted, with the per-word probabilities aligned underneath.

        Args:
            text1: Utterance of the first speaker.
            text2: Utterance of the second speaker.

        Returns:
            None. All results are written to standard output.
        """
        # Encoding the two texts as a pair lets the tokenizer insert the
        # special tokens (e.g. the separator) between them.
        encoding = self.tokenizer(
            text1, text2, return_tensors="pt", padding=True, truncation=True
        )
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**encoding)
        # logits: (batch_size, sequence_length, num_labels)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        label_probs = probs[0, :, self.label_id].tolist()

        tokens = self.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
        # word_ids maps each token to its word, sequence_ids to its text.
        word_ids = encoding.word_ids(batch_index=0)
        sequence_ids = encoding.sequence_ids(batch_index=0)

        words, word_probs = self._collect_words(
            tokens, word_ids, sequence_ids, label_probs
        )

        # A paraphrase needs highlighted words on both sides.
        highlighted_in_both = all(
            any(prob >= self._THRESHOLD for prob in text_probs)
            for text_probs in word_probs
        )
        if highlighted_in_both:
            print("is a paraphrase")
        else:
            print("is not a paraphrase")

        print("\nSpeaker 1:")
        self._print_aligned(words[0], word_probs[0])
        print("Speaker 2:")
        self._print_aligned(words[1], word_probs[1])

    def _collect_words(self, tokens, word_ids, sequence_ids, label_probs):
        """Merge sub-word tokens into words, split by the text they belong to.

        Returns:
            ``(words, word_probs)``: two pairs of lists indexed by sequence id
            (0 for the first text, 1 for the second). Each word carries the
            probability of its first token and is upper-cased when that
            probability reaches the threshold.
        """
        words = ([], [])
        word_probs = ([], [])
        previous_key = None
        for idx, (word_idx, seq_id) in enumerate(zip(word_ids, sequence_ids)):
            if word_idx is None:
                # Special tokens ([CLS], [SEP], [PAD]) belong to no word.
                continue
            # Word indices restart for the second text, so a word is
            # identified by (sequence, word index); comparing the word index
            # alone would drop the first word of text2 after a one-word text1.
            key = (seq_id, word_idx)
            if key == previous_key:
                continue
            previous_key = key
            if seq_id not in (0, 1):
                # Not expected for a text pair; ignore rather than mis-file.
                continue

            # Extend over the following tokens of the same word.
            end = idx + 1
            while (
                end < len(word_ids)
                and word_ids[end] == word_idx
                and sequence_ids[end] == seq_id
            ):
                end += 1
            word = self.tokenizer.convert_tokens_to_string(tokens[idx:end]).strip()

            # The first token's probability stands for the whole word.
            prob = label_probs[idx]
            words[seq_id].append(word.upper() if prob >= self._THRESHOLD else word)
            word_probs[seq_id].append(prob)
        return words, word_probs

    @staticmethod
    def _print_aligned(words, probs):
        """Print ``words`` in one row and their probabilities in the row below."""
        prob_cells = [f"{prob:.2f}" for prob in probs]
        # One shared column width that fits every word and every probability;
        # default=0 keeps an empty text from raising in max().
        width = max((len(cell) for cell in list(words) + prob_cells), default=0)
        cell_format = f'{{:<{width}}}'
        print(''.join(cell_format.format(word) + ' ' for word in words))
        print(''.join(cell_format.format(cell) + ' ' for cell in prob_cells))
        # Blank line after the block.
        print()
# Example usage: score one pair of utterances with the default checkpoint.
text1 = "And it will be my 20th time in doing it as a television commentator from Rome so."
text2 = "Yes, you've been doing this for a while now."
# Constructing the highlighter loads the tokenizer and the model.
highlighter = ParaphraseHighlighter()
# Prints the verdict and the per-word probabilities for both speakers.
highlighter.highlight_paraphrase(text1, text2)
```
Running this example should print:
```
is a paraphrase
Speaker 1:
And IT will BE MY 20TH TIME IN DOING IT as a TELEVISION COMMENTATOR from Rome so.
0.15 0.54 0.49 0.56 0.74 0.83 0.77 0.75 0.78 0.76 0.44 0.45 0.52 0.52 0.30 0.37 0.21
Speaker 2:
Yes, YOU'VE BEEN DOING THIS FOR A WHILE NOW.
0.12 0.79 0.78 0.82 0.82 0.69 0.70 0.72 0.66
```
For comments or questions, reach out to Anna (a.m.wegmann @ uu.nl) or raise an issue on GitHub.
If you find this model helpful, consider citing our paper:
```
@article{wegmann2024,
title={What's Mine becomes Yours: Defining, Annotating and Detecting Context-Dependent Paraphrases in News Interview Dialogs},
author={Wegmann, Anna and Broek, Tijs van den and Nguyen, Dong},
journal={arXiv preprint arXiv:2404.06670},
year={2024}
}
```