AnnaWegmann committed on
Commit fb606dd
1 Parent(s): da918e5

Update README.md

Files changed (1)
  1. README.md +142 -1
README.md CHANGED
@@ -8,7 +8,148 @@ base_model:
  - microsoft/deberta-v3-large
  ---

- Model was created as described in https://arxiv.org/abs/2404.06670 , this is the best `DeBERTa ALL` model.
+ The model was created as described in https://arxiv.org/abs/2404.06670; this is the best `DeBERTa ALL` model. See also the [GitHub](https://github.com/nlpsoc/Paraphrases-in-News-Interviews) repository.
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ import torch
+
+ class ParaphraseHighlighter:
+     def __init__(self, model_name="AnnaWegmann/Highlight-Paraphrases-in-Dialog-ALL"):
+         # Load the tokenizer and model
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+         self.model = AutoModelForTokenClassification.from_pretrained(model_name)
+
+         # Get the id of 'LABEL_1' (tokens that are part of a paraphrase)
+         self.label2id = self.model.config.label2id
+         self.label_id = self.label2id['LABEL_1']
+
+     def highlight_paraphrase(self, text1, text2):
+         # Tokenize the two texts as a pair; the tokenizer automatically inserts [CLS]/[SEP]
+         encoding = self.tokenizer(text1, text2, return_tensors="pt", padding=True, truncation=True)
+
+         outputs = self.model(**encoding)
+         logits = outputs.logits  # Shape: (batch_size, sequence_length, num_labels)
+         # Apply softmax to get per-token label probabilities
+         probs = torch.nn.functional.softmax(logits, dim=-1)  # Shape: (batch_size, sequence_length, num_labels)
+
+         # Convert token IDs back to tokens
+         tokens = self.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
+         # Get word IDs to map tokens to words
+         word_ids = encoding.word_ids(batch_index=0)
+         # Get sequence IDs to know which text a token belongs to
+         sequence_ids = encoding.sequence_ids(batch_index=0)
+
+         # Collect words and probabilities for each text
+         words_text1 = []
+         words_text2 = []
+         probs_text1 = []
+         probs_text2 = []
+
+         previous_word_idx = None
+
+         # For determining if there are high-probability words in both texts
+         has_high_prob_text1 = False
+         has_high_prob_text2 = False
+
+         for idx, (word_idx, seq_id) in enumerate(zip(word_ids, sequence_ids)):
+             if word_idx is None:
+                 # Skip special tokens like [CLS], [SEP], [PAD]
+                 continue
+
+             if word_idx != previous_word_idx:
+                 # Start of a new word
+                 word_tokens = [tokens[idx]]
+
+                 # Get the probability for LABEL_1 for the first token of the word
+                 prob_LABEL_1 = probs[0][idx][self.label_id].item()
+
+                 # Collect subsequent tokens belonging to the same word
+                 j = idx + 1
+                 while j < len(word_ids) and word_ids[j] == word_idx:
+                     word_tokens.append(tokens[j])
+                     j += 1
+
+                 # Reconstruct the word
+                 word = self.tokenizer.convert_tokens_to_string(word_tokens).strip()
+
+                 # Uppercase the word if its probability is >= 0.5
+                 if prob_LABEL_1 >= 0.5:
+                     word_display = word.upper()
+                     if seq_id == 0:
+                         has_high_prob_text1 = True
+                     elif seq_id == 1:
+                         has_high_prob_text2 = True
+                 else:
+                     word_display = word
+
+                 # Append the word and probability to the appropriate list
+                 if seq_id == 0:
+                     words_text1.append(word_display)
+                     probs_text1.append(prob_LABEL_1)
+                 elif seq_id == 1:
+                     words_text2.append(word_display)
+                     probs_text2.append(prob_LABEL_1)
+                 else:
+                     # Should not happen: non-special tokens belong to sequence 0 or 1
+                     pass
+
+             previous_word_idx = word_idx
+
+         # The pair counts as a paraphrase if both texts contain words with prob >= 0.5
+         if has_high_prob_text1 and has_high_prob_text2:
+             print("is a paraphrase")
+         else:
+             print("is not a paraphrase")
+
+         # Helper to format and align words and probabilities
+         def print_aligned(words, probs):
+             # Determine the maximum word length for formatting
+             max_word_length = max(len(word) for word in words)
+             # Create format string for alignment
+             format_str = f'{{:<{max_word_length}}}'
+             # Print words
+             for word in words:
+                 print(format_str.format(word), end=' ')
+             print()
+             # Print probabilities aligned below the words
+             for prob in probs:
+                 prob_str = f"{prob:.2f}"
+                 print(format_str.format(prob_str), end=' ')
+             print('\n')
+
+         # Print text1's words and probabilities aligned
+         print("\nSpeaker 1:")
+         print_aligned(words_text1, probs_text1)
+
+         # Print text2's words and probabilities aligned
+         print("Speaker 2:")
+         print_aligned(words_text2, probs_text2)
+
+ # Example usage
+ highlighter = ParaphraseHighlighter()
+ text1 = "And it will be my 20th time in doing it as a television commentator from Rome so."
+ text2 = "Yes, you've been doing this for a while now."
+ highlighter.highlight_paraphrase(text1, text2)
+ ```
+
+ should print
+
+ ```
+ is a paraphrase
+
+ Speaker 1:
+ And it will be my 20TH TIME IN DOING IT as a television commentator from Rome so.
+ 0.06 0.38 0.35 0.37 0.45 0.60 0.51 0.51 0.51 0.59 0.38 0.37 0.42 0.38 0.24 0.26 0.14
+
+ Speaker 2:
+ Yes, YOU'VE BEEN DOING THIS FOR A WHILE now.
+ 0.07 0.60 0.65 0.63 0.68 0.62 0.60 0.64 0.48
+ ```
+
+ If you find this model helpful, consider citing our paper:

  ```
  @article{wegmann2024,