Upload folder using huggingface_hub
Files changed:
- app.py +15 -26
- entailment.py +1 -1
- highlighter.py +15 -1
- lcs.py +3 -14
- non_melting_points.py +137 -0
- paraphraser.py +1 -1
- twokenize.py +317 -0
app.py
CHANGED
@@ -5,7 +5,7 @@ import gradio as gr
 import time
 from tree import generate_subplot1, generate_subplot2
 from paraphraser import generate_paraphrase
-from lcs import find_common_subsequences, find_common_gram_positions
+# from lcs import find_common_subsequences, find_common_gram_positions
 from highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
 from entailment import analyze_entailment
 from masking_methods import mask_non_stopword, mask_non_stopword_pseudorandom, high_entropy_words
@@ -14,7 +14,9 @@ from detectability import SentenceDetectabilityCalculator
 from distortion import SentenceDistortionCalculator
 from euclidean_distance import SentenceEuclideanDistanceCalculator
 from threeD_plot import gen_three_D_plot
-
+
+from twokenize import tokenize_sentences, tokenize_sentence
+from non_melting_points import find_non_melting_points
 
 class WatermarkingPipeline:
     def __init__(self):
@@ -39,7 +41,6 @@ class WatermarkingPipeline:
     def step1_paraphrasing(self, prompt, threshold=0.7):
         start_time = time.time()
 
-        # Existing step1 code...
         self.user_prompt = prompt
         self.paraphrased_sentences = generate_paraphrase(prompt)
         if self.paraphrased_sentences is None:
@@ -47,15 +48,17 @@ class WatermarkingPipeline:
 
         self.analyzed_paraphrased_sentences, self.selected_sentences, self.discarded_sentences = \
             analyze_entailment(self.user_prompt, self.paraphrased_sentences, threshold)
-
-        self.
-        self.
-        self.
-
-
-
-
-
+
+        self.user_prompt_tokenized = tokenize_sentence(self.user_prompt)
+        self.selected_sentences_tokenized = tokenize_sentences(self.selected_sentences)
+        self.discarded_sentences_tokenized = tokenize_sentences(self.discarded_sentences)
+
+        all_tokenized_sentences = []
+        all_tokenized_sentences.append(self.user_prompt_tokenized)
+        all_tokenized_sentences.extend(self.selected_sentences_tokenized)
+        all_tokenized_sentences.extend(self.discarded_sentences_tokenized)
+
+        self.common_grams = find_non_melting_points(all_tokenized_sentences)
 
         highlighted_user_prompt = highlight_common_words(
             self.common_grams, [self.user_prompt], "Highlighted LCS in the User Prompt"
@@ -227,9 +230,6 @@ class WatermarkingPipeline:
 
         return three_D_plot, time_info
 
-    def step6_sankey(self):
-        return generate_sankey_diagram()
-
 def create_gradio_interface():
     pipeline = WatermarkingPipeline()
 
@@ -289,11 +289,6 @@ def create_gradio_interface():
         gr.Markdown("### 3D Visualization of Metrics")
         three_D_plot = gr.Plot()
         step5_time = gr.Textbox(label="Execution Time", interactive=False)
-
-        # Sankey Diagram
-        gr.Markdown("# Watermarking Pipeline Flow Visualization")
-        generate_button = gr.Button("Generate Sankey Diagram")
-        sankey_plot = gr.Plot()
 
         paraphrase_button.click(
             pipeline.step1_paraphrasing,
@@ -325,12 +320,6 @@ def create_gradio_interface():
             inputs=None,
             outputs=[three_D_plot, step5_time]
         )
-
-        generate_button.click(
-            pipeline.step6_sankey,
-            inputs=None,
-            outputs=sankey_plot
-        )
 
     return demo
 
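Net effect of the app.py change: step 1 no longer uses the lcs helpers; it tweet-tokenizes the prompt plus its selected and discarded paraphrases, then derives common_grams from exact n-gram matches ("non-melting points"). A minimal sketch of that new data flow outside the Gradio app, using only functions that appear in this commit (the example prompt string is made up):

from paraphraser import generate_paraphrase
from entailment import analyze_entailment
from twokenize import tokenize_sentence, tokenize_sentences
from non_melting_points import find_non_melting_points

prompt = "The quick brown fox jumps over the lazy dog."  # illustrative input

# Paraphrase, then split paraphrases into selected/discarded by entailment score
paraphrases = generate_paraphrase(prompt)
_, selected, discarded = analyze_entailment(prompt, paraphrases, 0.7)

# Tokenize everything the same way before n-gram matching
all_tokenized = [tokenize_sentence(prompt)]
all_tokenized.extend(tokenize_sentences(selected))
all_tokenized.extend(tokenize_sentences(discarded))

# (rank, ngram) pairs shared verbatim by every sentence, ordered by position in the prompt
common_grams = find_non_melting_points(all_tokenized)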
entailment.py
CHANGED
@@ -28,4 +28,4 @@ def analyze_entailment(original_sentence, paraphrased_sentences, threshold):
 
     return all_sentences, selected_sentences, discarded_sentences
 
-# print(analyze_entailment("I love you", [""], 0.7))
+# print(analyze_entailment("I love you", ["I like you", "I hate you"], 0.7))
highlighter.py
CHANGED
@@ -85,6 +85,7 @@ def highlight_common_words_dict(common_words, sentences, title):
     </div>
     '''
 
+
 def reparaphrased_sentences_html(sentences):
 
     formatted_sentences = []
@@ -101,4 +102,17 @@ def reparaphrased_sentences_html(sentences):
         box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
         <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
     </div>
-    '''
+    '''
+
+
+common_words = [(1, "highlight"), (2, "numbering")]
+sentences = ["This is a test to highlight words.", "Numbering is important for clarity."]
+
+# Test highlight_common_words
+highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting")
+print(highlighted_html)
+
+# Test highlight_common_words_dict
+sentences_with_scores = {"Highlight words in this text.": 0.95, "Number sentences for clarity.": 0.8}
+highlighted_html_dict = highlight_common_words_dict(common_words, sentences_with_scores, "Test Dict Highlighting")
+print(highlighted_html_dict)
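Because the appended test calls sit at module top level, they execute (and print HTML) every time highlighter is imported, including the import in app.py. A guarded variant is a common alternative; this is only a sketch, not part of the commit:

if __name__ == "__main__":
    # run the quick smoke test only when the module is executed directly
    common_words = [(1, "highlight"), (2, "numbering")]
    sentences = ["This is a test to highlight words.", "Numbering is important for clarity."]
    print(highlight_common_words(common_words, sentences, "Test Highlighting"))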
lcs.py
CHANGED
@@ -71,20 +71,9 @@ def find_common_gram_positions(str_list, common_grams):
     return positions
 
 
-#
-# sentence = "
-# str_list = [
-# 'During a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump stated that being Jewish in the United States has never been more hazardous since the Holocaust.',
-# 'At a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump declared that being Jewish in the United States has never been more hazardous since the Holocaust.',
-# 'Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania, and stated that being Jewish in the United States has never been more perilous since the Holocaust.',
-# 'Donald Trump made the statement at a campaign rally in Wilkes-Barre, Pennsylvania, saying that being Jewish in the United States has never been more dangerous since the Holocaust.',
-# 'Last month, Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania and stated that being Jewish in the United States has never been more hazardous than during World War II.',
-# 'In Wilkes-Barre, Pennsylvania, Donald Trump spoke at a campaign rally and claimed that the Holocaust was always more hazardous for Jews in the United States.',
-# 'A campaign rally in Wilkes-Barre, Pennsylvania saw Donald Trump declare that being Jewish in the United States has never been more perilous since WWII.',
-# 'Speaking at a campaign rally in Wilkes-Barre, Pennsylvania today, Donald Trump declared that being Jewish has never been more hazardous in the United States since the Holocaust.',
-# 'During his campaign rally in Wilkes-Barre, Pennsylvania today Donald Trump stated: "There has never been a safer place for being Jewish in the United States since the Holocaust."',
-# 'At a campaign rally in Wilkes-Barre, Pennsylvania (pictured), Donald Trump said, "There has never been... gotten worse for being Jewish in America since the Holocaust."'
-# ]
+# Example usage
+# sentence = "I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."
+# str_list = ['The possibility of a robot being the next "great writer" is underwhelming, as I doubt they will be able to convey the same level of intelligence and complexity as humans.', 'I doubt that the next "great writer" will be a robot or that they\'ll be capable of conveying the same level of intelligence and complexity as humans.', 'It\'s my hunches that the next "great writer" will be a robot, or that they\'ll be much more capable of conveying the subtle and profound aspects of human thinking than humans themselves.', 'I have little faith that the next "great writer" will be a robot or that they can accurately convey the subtle and profound aspects of human thinking.', 'My suspicion is that the next "great writer" will be a robot or that they\'ll possess greater intelligence and complexity than if not already present in human form.', 'The idea that a robot will be the next "great writer" is beyond doubt, as I\'m not convinced they can convey the same level of complexity and sophistication as humans.', 'But I\'m very suspicious of the future -- we might hope that someday a robot will be the next "great writer" or at least they\'ll be able to convey the depth and complexity of what humans think than any human.', 'There is a growing doubt in my mind that the next "great writer" will be dominated by gizmos or even capable of outlining every detail and depth of human thought.', 'It seems unlikely that a robot will be the next great writer or that they can convey the subtle and profound aspects of human thinking in evocative terms.', 'Whether or not the next "great writer" is an unknown, and I\'m skeptical about whether they can ever truly embody human thought.']
 
 # # Find common subsequences
 # common_grams = find_common_subsequences(sentence, str_list)
non_melting_points.py
ADDED
@@ -0,0 +1,137 @@
import nltk
from nltk.corpus import stopwords

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

def remove_stopwords(text):
    """
    Remove stopwords using NLTK's stopword list

    Args:
        text (str): Input text

    Returns:
        str: Cleaned text with stopwords removed
    """
    stop_words = set(stopwords.words('english'))
    words = text.lower().split()
    return ' '.join([word for word in words if word not in stop_words])

def is_exact_match(ngram, sentences):
    """
    Check if the given n-gram has an exact match in all sentences

    Args:
        ngram (str): The n-gram to search for
        sentences (list): List of sentences to search in

    Returns:
        bool: True if n-gram has exact match in all sentences, False otherwise
    """
    sentence_ngrams = []
    for sentence in sentences:
        words = sentence.split()
        current_ngrams = []
        n = len(ngram.split())

        for i in range(len(words) - n + 1):
            current_ngram = " ".join(words[i:i+n])
            current_ngrams.append(current_ngram)

        sentence_ngrams.append(set(current_ngrams))

    return all(ngram in sent_ngrams for sent_ngrams in sentence_ngrams)

def is_substring_of_any(ngram, common_ngrams):
    """
    Check if the given n-gram is an exact substring of any previously found common n-grams

    Args:
        ngram (str): The n-gram to check
        common_ngrams (list): List of previously found common n-grams

    Returns:
        bool: True if ngram is a substring of any common_ngrams, False otherwise
    """
    ngram_words = ngram.split()
    for common_gram in common_ngrams:
        common_words = common_gram.split()
        for i in range(len(common_words) - len(ngram_words) + 1):
            if " ".join(common_words[i:i+len(ngram_words)]) == ngram:
                return True
    return False

def find_filtered_ngrams(sentences):
    """
    Find all n-grams that have exact matches across all sentences,
    excluding those that are part of larger common n-grams

    Args:
        sentences (list): List of sentences to analyze

    Returns:
        list: List of all common n-grams in order of their appearance in the first sentence
    """
    # First, remove stopwords from all sentences
    cleaned_sentences = [remove_stopwords(sentence) for sentence in sentences]

    words = cleaned_sentences[0].split()
    max_n = len(words)
    all_common_ngrams = []

    for n in range(max_n, 0, -1):
        for i in range(len(words) - n + 1):
            ngram = " ".join(words[i:i+n])

            if is_exact_match(ngram, cleaned_sentences) and not is_substring_of_any(ngram, all_common_ngrams):
                all_common_ngrams.append(ngram)

    return all_common_ngrams

def find_relative_order(sentence, common_ngrams):
    sentence = sentence.lower()
    ngram_positions = {}

    for ngram in common_ngrams:
        ngram_lower = ngram.lower()
        if ngram_lower in sentence:
            position = sentence.index(ngram_lower)
            ngram_positions[ngram] = position

    sorted_ngrams = sorted(ngram_positions.items(), key=lambda x: x[1])

    result = [(i + 1, ngram) for i, (ngram, _) in enumerate(sorted_ngrams)]

    return result


def find_non_melting_points(sent_list):

    # Find filtered n-grams
    common_ngrams = find_filtered_ngrams(sent_list)

    def remove_punctuation(common_ngrams):
        punctuation = ".?!.;,:'\"()[]{}-–—...+/\\*^|@#%&_~`"
        for item in common_ngrams:
            if item in punctuation:
                common_ngrams.remove(item)
        return common_ngrams

    final_list = remove_punctuation(common_ngrams)
    sentence = sent_list[0]
    non_melting_points = find_relative_order(sentence, final_list)

    return non_melting_points


# Example usage
# from paraphraser import generate_paraphrase
# from twokenize import tokenize_sentences

# sentences = tokenize_sentences(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
# non_melting_points = find_non_melting_points(sentences)

# print(non_melting_points)
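find_non_melting_points works on plain whitespace-separated strings, so it can be exercised without the paraphrasing model. A small self-contained sketch (the sentences are invented; the exact result depends on NLTK's English stopword list):

from non_melting_points import find_non_melting_points

sentences = [
    "the quick brown fox jumped over a sleepy dog",
    "a sleepy dog was jumped over by the quick brown fox",
    "over a sleepy dog the quick brown fox jumped",
]

# Stopwords are stripped, the longest n-grams shared verbatim by every sentence
# are kept (their sub-n-grams are dropped), and the survivors are numbered by
# where they occur in the first sentence.
print(find_non_melting_points(sentences))
# expected: [(1, 'quick brown fox'), (2, 'jumped'), (3, 'sleepy dog')]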
paraphraser.py
CHANGED
@@ -28,7 +28,7 @@ def generate_paraphrase(question):
     res = paraphrase(question, para_tokenizer, para_model)
     return res
 
-# print(generate_paraphrase("
+# print(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
 
 '''
 Accepts a sentence or list of sentences and returns a lit of all their paraphrases using GPT-4.
twokenize.py
ADDED
@@ -0,0 +1,317 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
This tokenizer code has gone through a long history:

(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
TweetMotif: Exploratory Search and Topic Summarization for Twitter.
Brendan O'Connor, Michel Krieger, and David Ahn.
ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
(2b) Jason Baldridge and David Snyder ported it to Scala
(3) Brendan bugfixed the Scala port and merged with POS-specific changes
for the CMU ARK Twitter POS Tagger
(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)

Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP

There have been at least 2 other Java ports, but they are not in the lineage for the code here.

Ported to Python by Myle Ott <[email protected]>.
"""
from __future__ import unicode_literals

import operator
import re
import sys

try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser

try:
    import html
except ImportError:
    pass

def regex_or(*items):
    return '(?:' + '|'.join(items) + ')'

Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)

punctChars = r"['\"“”‘’.?!…,:;]"
#punctSeq = punctChars+"+" #'anthem'. => ' anthem '.
punctSeq = r"['\"“”‘’]+|[.?!,…]+|[:;]+" #'anthem'. => ' anthem ' .
entity = r"&(?:amp|lt|gt|quot);"
# URLs


# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
# If you actually empirically test it the results are bad.
# Please see https://github.com/brendano/ark-tweet-nlp/pull/9

urlStart1 = r"(?:https?://|\bwww\.)"
commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
ccTLDs = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
    r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
    r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
    r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
    r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
    r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
    r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
    r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
urlEnd = r"(?:\.\.+|[<>]|\s|$)"
url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"


# Numeric
timeLike = r"\d+(?::\d+){1,2}"
#numNum = r"\d+\.\d+"
numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
numComb = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?"

# Abbreviations
boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
aa1 = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
aa2 = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
separators = "(?:--+|―|—|~|–|=)"
decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)"
thingsThatSplitWords = r"[^\s\.,?\"]"
embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"

# Emoticons
# myleott: in Python the (?iu) flags affect the whole expression
#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
normalEyes = "[:=]" # 8 and x are eyes but cause problems
wink = "[;]"
noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
happyMouths = r"[D\)\]\}]+"
sadMouths = r"[\(\[\{]+"
tongue = "[pPd3]+"
otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned

# mouth repetition examples:
# @aliciakeys Put it in a love song :-))
# @hellocalyclops =))=))=)) Oh well

# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)"
bfCenter = r"(?:[\.]|[_-]+)"
bfRight = r"\2"
s3 = r"(?:--['\"])"
s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5

eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+"
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight

oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"


emoticon = regex_or(
    # Standard version :) :( :] :D :P
    "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),

    # reversed version (: D: use positive lookbehind to remove "(word):"
    # because eyes on the right side is more ambiguous with the standard usage of : ;
    regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",

    #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
    eastEmote.replace("2", "1", 1), basicface,
    # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
    # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this

    # myleott: o.O and O.o are two of the biggest sources of differences
    # between this and the Java version. One little hack won't hurt...
    oOEmote
)

Hearts = "(?:<+/?3+)+" #the other hearts are in decorations

Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+")

# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
# "hello (#hashtag)" ==> "hello (#hashtag )" WRONG
# "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT
# "hello (@person)" ==> "hello (@person )" WRONG
# "hello (@person)" ==> "hello ( @person )" RIGHT
# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
# has poor content-symbol detection.

# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
# If you want good hashtag identification, use a different regex.
Hashtag = "#[a-zA-Z0-9_]+" #optional: lookbehind for \b
#optional: lookbehind for \b, max length 15
AtMention = "[@＠][a-zA-Z0-9_]+"

# I was worried this would conflict with at-mentions
# but seems ok in sample of 5800: 7 changes all email fixes
# http://www.regular-expressions.info/email.html
Bound = r"(?:\W|^|$)"
Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"

# We will be tokenizing using these regexps as delimiters
# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
Protected = re.compile(
    regex_or(
        Hearts,
        url,
        Email,
        timeLike,
        #numNum,
        numberWithCommas,
        numComb,
        emoticon,
        Arrows,
        entity,
        punctSeq,
        arbitraryAbbrev,
        separators,
        decorations,
        embeddedApostrophe,
        Hashtag,
        AtMention), re.UNICODE)

# Edge punctuation
# Want: 'foo' => ' foo '
# While also: don't => don't
# the first is considered "edge punctuation".
# the second is word-internal punctuation -- don't want to mess with it.
# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
# I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.

# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
edgePunct = "[" + edgePunctChars + "]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)

def splitEdgePunct(input):
    input = EdgePunctLeft.sub(r"\1\2 \3", input)
    input = EdgePunctRight.sub(r"\1 \2\3", input)
    return input

# The main work of tokenizing a tweet.
def simpleTokenize(text):

    # Do the no-brainers first
    splitPunctText = splitEdgePunct(text)

    textLength = len(splitPunctText)

    # BTO: the logic here got quite convoluted via the Scala porting detour
    # It would be good to switch back to a nice simple procedural style like in the Python version
    # ... Scala is such a pain. Never again.

    # Find the matches for subsequences that should be protected,
    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
    bads = []
    badSpans = []
    for match in Protected.finditer(splitPunctText):
        # The spans of the "bads" should not be split.
        if (match.start() != match.end()): #unnecessary?
            bads.append( [splitPunctText[match.start():match.end()]] )
            badSpans.append( (match.start(), match.end()) )

    # Create a list of indices to create the "goods", which can be
    # split. We are taking "bad" spans like
    #     List((2,5), (8,10))
    # to create
    #     List(0, 2, 5, 8, 10, 12)
    # where, e.g., "12" here would be the textLength
    # has an even length and no indices are the same
    indices = [0]
    for (first, second) in badSpans:
        indices.append(first)
        indices.append(second)
    indices.append(textLength)

    # Group the indices and map them to their respective portion of the string
    splitGoods = []
    for i in range(0, len(indices), 2):
        goodstr = splitPunctText[indices[i]:indices[i+1]]
        splitstr = goodstr.strip().split(" ")
        splitGoods.append(splitstr)

    # Reinterpolate the 'good' and 'bad' Lists, ensuring that
    # additonal tokens from last good item get included
    zippedStr = []
    for i in range(len(bads)):
        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
        zippedStr = addAllnonempty(zippedStr, bads[i])
    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])

    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
    # Uncomment to get "you 're"
    #splitStr = []
    #for tok in zippedStr:
    #    splitStr.extend(splitToken(tok))
    #zippedStr = splitStr

    return zippedStr

def addAllnonempty(master, smaller):
    for s in smaller:
        strim = s.strip()
        if (len(strim) > 0):
            master.append(strim)
    return master

# "foo   bar " => "foo bar"
def squeezeWhitespace(input):
    return Whitespace.sub(" ", input).strip()

# Final pass tokenization based on special patterns
def splitToken(token):
    m = Contractions.search(token)
    if m:
        return [m.group(1), m.group(2)]
    return [token]

# Assume 'text' has no HTML escaping.
def tokenize(text):
    return simpleTokenize(squeezeWhitespace(text))


# Twitter text comes HTML-escaped, so unescape it.
# We also first unescape &amp;'s, in case the text has been buggily double-escaped.
def normalizeTextForTagger(text):
    assert sys.version_info[0] >= 3 and sys.version_info[1] > 3, 'Python version >3.3 required'
    text = text.replace("&amp;", "&")
    text = html.unescape(text)
    return text

# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
#
# This function normalizes the input text BEFORE calling the tokenizer.
# So the tokens you get back may not exactly correspond to
# substrings of the original text.
def tokenizeRawTweetText(text):
    tokens = tokenize(normalizeTextForTagger(text))
    return tokens

def tokenize_sentences(all_sentences):
    sent_list = []
    for sentence in all_sentences:
        sent_list.append(' '.join(tokenizeRawTweetText(sentence)))
    return sent_list

def tokenize_sentence(sentence):
    return ' '.join(tokenizeRawTweetText(sentence))
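The two helpers appended at the end, tokenize_sentences and tokenize_sentence, are the interface the rest of this commit consumes (app.py and non_melting_points.py): they run the ARK twokenizer and join the tokens back into a single space-separated string, so downstream code can recover tokens with a plain split(). A quick sketch; the example strings are illustrative only:

from twokenize import tokenize_sentence, tokenize_sentences

s = 'I\'m very skeptical that the next "great writer" will be a robot.'

# One sentence -> space-joined token string; the quotes and the final period
# become separate tokens, while contractions such as "I'm" stay intact.
print(tokenize_sentence(s))

# A list of sentences -> list of space-joined token strings
print(tokenize_sentences([s, 'A robot will be the next "great writer".']))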