jgyasu committed (verified)
Commit 0840f0a · 1 Parent(s): 2471de4

Upload folder using huggingface_hub

Files changed (7)
  1. app.py +15 -26
  2. entailment.py +1 -1
  3. highlighter.py +15 -1
  4. lcs.py +3 -14
  5. non_melting_points.py +137 -0
  6. paraphraser.py +1 -1
  7. twokenize.py +317 -0
app.py CHANGED
@@ -5,7 +5,7 @@ import gradio as gr
 import time
 from tree import generate_subplot1, generate_subplot2
 from paraphraser import generate_paraphrase
-from lcs import find_common_subsequences, find_common_gram_positions
+# from lcs import find_common_subsequences, find_common_gram_positions
 from highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
 from entailment import analyze_entailment
 from masking_methods import mask_non_stopword, mask_non_stopword_pseudorandom, high_entropy_words
@@ -14,7 +14,9 @@ from detectability import SentenceDetectabilityCalculator
 from distortion import SentenceDistortionCalculator
 from euclidean_distance import SentenceEuclideanDistanceCalculator
 from threeD_plot import gen_three_D_plot
-from sankey import generate_sankey_diagram
+
+from twokenize import tokenize_sentences, tokenize_sentence
+from non_melting_points import find_non_melting_points
 
 class WatermarkingPipeline:
     def __init__(self):
@@ -39,7 +41,6 @@ class WatermarkingPipeline:
     def step1_paraphrasing(self, prompt, threshold=0.7):
         start_time = time.time()
 
-        # Existing step1 code...
         self.user_prompt = prompt
         self.paraphrased_sentences = generate_paraphrase(prompt)
         if self.paraphrased_sentences is None:
@@ -47,15 +48,17 @@
 
         self.analyzed_paraphrased_sentences, self.selected_sentences, self.discarded_sentences = \
             analyze_entailment(self.user_prompt, self.paraphrased_sentences, threshold)
-
-        self.common_grams = find_common_subsequences(self.user_prompt, self.selected_sentences)
-        self.subsequences = [subseq for _, subseq in self.common_grams]
-        self.common_grams_position = find_common_gram_positions(self.selected_sentences, self.subsequences)
-
-        colors = ["red", "blue", "brown", "green"]
-        def select_color():
-            return random.choice(colors)
-        highlight_info = [(word, select_color()) for _, word in self.common_grams]
+
+        self.user_prompt_tokenized = tokenize_sentence(self.user_prompt)
+        self.selected_sentences_tokenized = tokenize_sentences(self.selected_sentences)
+        self.discarded_sentences_tokenized = tokenize_sentences(self.discarded_sentences)
+
+        all_tokenized_sentences = []
+        all_tokenized_sentences.append(self.user_prompt_tokenized)
+        all_tokenized_sentences.extend(self.selected_sentences_tokenized)
+        all_tokenized_sentences.extend(self.discarded_sentences_tokenized)
+
+        self.common_grams = find_non_melting_points(all_tokenized_sentences)
 
         highlighted_user_prompt = highlight_common_words(
             self.common_grams, [self.user_prompt], "Highlighted LCS in the User Prompt"
@@ -227,9 +230,6 @@
 
         return three_D_plot, time_info
 
-    def step6_sankey(self):
-        return generate_sankey_diagram()
-
 def create_gradio_interface():
     pipeline = WatermarkingPipeline()
 
@@ -289,11 +289,6 @@
            gr.Markdown("### 3D Visualization of Metrics")
            three_D_plot = gr.Plot()
            step5_time = gr.Textbox(label="Execution Time", interactive=False)
-
-        # Sankey Diagram
-        gr.Markdown("# Watermarking Pipeline Flow Visualization")
-        generate_button = gr.Button("Generate Sankey Diagram")
-        sankey_plot = gr.Plot()
 
         paraphrase_button.click(
             pipeline.step1_paraphrasing,
@@ -325,12 +320,6 @@
             inputs=None,
             outputs=[three_D_plot, step5_time]
         )
-
-        generate_button.click(
-            pipeline.step6_sankey,
-            inputs=None,
-            outputs=sankey_plot
-        )
 
     return demo
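
Taken together, the app.py hunks replace the LCS-based common_grams computation (and the Sankey step) with twokenize plus non_melting_points. Below is a minimal standalone sketch of the new step 1 data flow, using only modules already present in this repo; the prompt string and the result shown in the comment are purely illustrative, not part of the commit.

# Sketch of the new step 1 data flow (illustrative)
from paraphraser import generate_paraphrase
from entailment import analyze_entailment
from twokenize import tokenize_sentence, tokenize_sentences
from non_melting_points import find_non_melting_points

prompt = "The quick brown fox jumps over the lazy dog."  # arbitrary example prompt
paraphrases = generate_paraphrase(prompt)
_, selected, discarded = analyze_entailment(prompt, paraphrases, 0.7)

# Pool the tokenized prompt with the tokenized selected and discarded paraphrases
all_tokenized = [tokenize_sentence(prompt)]
all_tokenized.extend(tokenize_sentences(selected))
all_tokenized.extend(tokenize_sentences(discarded))

common_grams = find_non_melting_points(all_tokenized)
print(common_grams)  # list of (rank, n-gram) pairs, e.g. [(1, "quick brown fox"), (2, "lazy dog")]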
 
entailment.py CHANGED
@@ -28,4 +28,4 @@ def analyze_entailment(original_sentence, paraphrased_sentences, threshold):
 
     return all_sentences, selected_sentences, discarded_sentences
 
-# print(analyze_entailment("I love you", [""], 0.7))
+# print(analyze_entailment("I love you", ["I like you", "I hate you"], 0.7))
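
A minimal usage sketch of the updated example (the sentences are illustrative; the function returns the analyzed sentences plus the selected/discarded split at the given threshold):

from entailment import analyze_entailment

all_sentences, selected, discarded = analyze_entailment(
    "I love you", ["I like you", "I hate you"], 0.7
)
print(selected)   # paraphrases kept by the 0.7 threshold
print(discarded)  # paraphrases filtered out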
highlighter.py CHANGED
@@ -85,6 +85,7 @@ def highlight_common_words_dict(common_words, sentences, title):
     </div>
     '''
 
+
 def reparaphrased_sentences_html(sentences):
 
     formatted_sentences = []
@@ -101,4 +102,17 @@ def reparaphrased_sentences_html(sentences):
     box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
     <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
     </div>
-    '''
+    '''
+
+
+common_words = [(1, "highlight"), (2, "numbering")]
+sentences = ["This is a test to highlight words.", "Numbering is important for clarity."]
+
+# Test highlight_common_words
+highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting")
+print(highlighted_html)
+
+# Test highlight_common_words_dict
+sentences_with_scores = {"Highlight words in this text.": 0.95, "Number sentences for clarity.": 0.8}
+highlighted_html_dict = highlight_common_words_dict(common_words, sentences_with_scores, "Test Dict Highlighting")
+print(highlighted_html_dict)
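
Note that the test calls added at the bottom of highlighter.py sit at module level, so they also execute whenever the module is imported (app.py imports it). A minimal sketch of the conventional guard, reusing the commit's own test data, in case that is not intended:

if __name__ == "__main__":
    common_words = [(1, "highlight"), (2, "numbering")]
    sentences = ["This is a test to highlight words.", "Numbering is important for clarity."]
    print(highlight_common_words(common_words, sentences, "Test Highlighting"))

    sentences_with_scores = {"Highlight words in this text.": 0.95, "Number sentences for clarity.": 0.8}
    print(highlight_common_words_dict(common_words, sentences_with_scores, "Test Dict Highlighting"))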
lcs.py CHANGED
@@ -71,20 +71,9 @@ def find_common_gram_positions(str_list, common_grams):
     return positions
 
 
-# # Example usage
-# sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."
-# str_list = [
-# 'During a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump stated that being Jewish in the United States has never been more hazardous since the Holocaust.',
-# 'At a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump declared that being Jewish in the United States has never been more hazardous since the Holocaust.',
-# 'Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania, and stated that being Jewish in the United States has never been more perilous since the Holocaust.',
-# 'Donald Trump made the statement at a campaign rally in Wilkes-Barre, Pennsylvania, saying that being Jewish in the United States has never been more dangerous since the Holocaust.',
-# 'Last month, Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania and stated that being Jewish in the United States has never been more hazardous than during World War II.',
-# 'In Wilkes-Barre, Pennsylvania, Donald Trump spoke at a campaign rally and claimed that the Holocaust was always more hazardous for Jews in the United States.',
-# 'A campaign rally in Wilkes-Barre, Pennsylvania saw Donald Trump declare that being Jewish in the United States has never been more perilous since WWII.',
-# 'Speaking at a campaign rally in Wilkes-Barre, Pennsylvania today, Donald Trump declared that being Jewish has never been more hazardous in the United States since the Holocaust.',
-# 'During his campaign rally in Wilkes-Barre, Pennsylvania today Donald Trump stated: "There has never been a safer place for being Jewish in the United States since the Holocaust."',
-# 'At a campaign rally in Wilkes-Barre, Pennsylvania (pictured), Donald Trump said, "There has never been... gotten worse for being Jewish in America since the Holocaust."'
-# ]
+# Example usage
+# sentence = "I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."
+# str_list = ['The possibility of a robot being the next "great writer" is underwhelming, as I doubt they will be able to convey the same level of intelligence and complexity as humans.', 'I doubt that the next "great writer" will be a robot or that they\'ll be capable of conveying the same level of intelligence and complexity as humans.', 'It\'s my hunches that the next "great writer" will be a robot, or that they\'ll be much more capable of conveying the subtle and profound aspects of human thinking than humans themselves.', 'I have little faith that the next "great writer" will be a robot or that they can accurately convey the subtle and profound aspects of human thinking.', 'My suspicion is that the next "great writer" will be a robot or that they\'ll possess greater intelligence and complexity than if not already present in human form.', 'The idea that a robot will be the next "great writer" is beyond doubt, as I\'m not convinced they can convey the same level of complexity and sophistication as humans.', 'But I\'m very suspicious of the future -- we might hope that someday a robot will be the next "great writer" or at least they\'ll be able to convey the depth and complexity of what humans think than any human.', 'There is a growing doubt in my mind that the next "great writer" will be dominated by gizmos or even capable of outlining every detail and depth of human thought.', 'It seems unlikely that a robot will be the next great writer or that they can convey the subtle and profound aspects of human thinking in evocative terms.', 'Whether or not the next "great writer" is an unknown, and I\'m skeptical about whether they can ever truly embody human thought.']
 
 # # Find common subsequences
 # common_grams = find_common_subsequences(sentence, str_list)

non_melting_points.py ADDED
@@ -0,0 +1,137 @@
+import nltk
+from nltk.corpus import stopwords
+
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords')
+
+def remove_stopwords(text):
+    """
+    Remove stopwords using NLTK's stopword list
+
+    Args:
+        text (str): Input text
+
+    Returns:
+        str: Cleaned text with stopwords removed
+    """
+    stop_words = set(stopwords.words('english'))
+    words = text.lower().split()
+    return ' '.join([word for word in words if word not in stop_words])
+
+def is_exact_match(ngram, sentences):
+    """
+    Check if the given n-gram has an exact match in all sentences
+
+    Args:
+        ngram (str): The n-gram to search for
+        sentences (list): List of sentences to search in
+
+    Returns:
+        bool: True if n-gram has exact match in all sentences, False otherwise
+    """
+    sentence_ngrams = []
+    for sentence in sentences:
+        words = sentence.split()
+        current_ngrams = []
+        n = len(ngram.split())
+
+        for i in range(len(words) - n + 1):
+            current_ngram = " ".join(words[i:i+n])
+            current_ngrams.append(current_ngram)
+
+        sentence_ngrams.append(set(current_ngrams))
+
+    return all(ngram in sent_ngrams for sent_ngrams in sentence_ngrams)
+
+def is_substring_of_any(ngram, common_ngrams):
+    """
+    Check if the given n-gram is an exact substring of any previously found common n-grams
+
+    Args:
+        ngram (str): The n-gram to check
+        common_ngrams (list): List of previously found common n-grams
+
+    Returns:
+        bool: True if ngram is a substring of any common_ngrams, False otherwise
+    """
+    ngram_words = ngram.split()
+    for common_gram in common_ngrams:
+        common_words = common_gram.split()
+        for i in range(len(common_words) - len(ngram_words) + 1):
+            if " ".join(common_words[i:i+len(ngram_words)]) == ngram:
+                return True
+    return False
+
+def find_filtered_ngrams(sentences):
+    """
+    Find all n-grams that have exact matches across all sentences,
+    excluding those that are part of larger common n-grams
+
+    Args:
+        sentences (list): List of sentences to analyze
+
+    Returns:
+        list: List of all common n-grams in order of their appearance in the first sentence
+    """
+    # First, remove stopwords from all sentences
+    cleaned_sentences = [remove_stopwords(sentence) for sentence in sentences]
+
+    words = cleaned_sentences[0].split()
+    max_n = len(words)
+    all_common_ngrams = []
+
+    for n in range(max_n, 0, -1):
+        for i in range(len(words) - n + 1):
+            ngram = " ".join(words[i:i+n])
+
+            if is_exact_match(ngram, cleaned_sentences) and not is_substring_of_any(ngram, all_common_ngrams):
+                all_common_ngrams.append(ngram)
+
+    return all_common_ngrams
+
+def find_relative_order(sentence, common_ngrams):
+    sentence = sentence.lower()
+    ngram_positions = {}
+
+    for ngram in common_ngrams:
+        ngram_lower = ngram.lower()
+        if ngram_lower in sentence:
+            position = sentence.index(ngram_lower)
+            ngram_positions[ngram] = position
+
+    sorted_ngrams = sorted(ngram_positions.items(), key=lambda x: x[1])
+
+    result = [(i + 1, ngram) for i, (ngram, _) in enumerate(sorted_ngrams)]
+
+    return result
+
+
+def find_non_melting_points(sent_list):
+
+    # Find filtered n-grams
+    common_ngrams = find_filtered_ngrams(sent_list)
+
+    def remove_punctuation(common_ngrams):
+        punctuation = ".?!.;,:'\"()[]{}-–—...+/\\*^|@#%&_~`"
+        for item in common_ngrams:
+            if item in punctuation:
+                common_ngrams.remove(item)
+        return common_ngrams
+
+    final_list = remove_punctuation(common_ngrams)
+    sentence = sent_list[0]
+    non_melting_points = find_relative_order(sentence, final_list)
+
+    return non_melting_points
+
+
+# Example usage
+# from paraphraser import generate_paraphrase
+# from twokenize import tokenize_sentences
+
+# sentences = tokenize_sentences(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
+# non_melting_points = find_non_melting_points(sentences)
+
+# print(non_melting_points)
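
The commented example at the end of the file depends on the paraphraser; here is a smaller self-contained sketch of what find_non_melting_points returns, using made-up sentences (assumes NLTK's English stopword list is available):

from non_melting_points import find_non_melting_points

sentences = [
    "the weather in berlin was sunny all week",
    "berlin enjoyed sunny weather for the whole week",
    "sunny weather stayed over berlin the entire week",
]
# Each kept n-gram must appear verbatim in every sentence after stopword removal;
# the result is ordered by position in the first sentence and numbered from 1.
print(find_non_melting_points(sentences))
# [(1, 'weather'), (2, 'berlin'), (3, 'sunny'), (4, 'week')]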
paraphraser.py CHANGED
@@ -28,7 +28,7 @@ def generate_paraphrase(question):
     res = paraphrase(question, para_tokenizer, para_model)
     return res
 
-# print(generate_paraphrase("Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time 5since the Holocaust” to be Jewish in the United States."))
+# print(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
 
 '''
 Accepts a sentence or list of sentences and returns a lit of all their paraphrases using GPT-4.

twokenize.py ADDED
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
+This tokenizer code has gone through a long history:
+
+(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
+TweetMotif: Exploratory Search and Topic Summarization for Twitter.
+Brendan O'Connor, Michel Krieger, and David Ahn.
+ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
+(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
+(2b) Jason Baldridge and David Snyder ported it to Scala
+(3) Brendan bugfixed the Scala port and merged with POS-specific changes
+for the CMU ARK Twitter POS Tagger
+(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)
+
+Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP
+
+There have been at least 2 other Java ports, but they are not in the lineage for the code here.
+
+Ported to Python by Myle Ott <[email protected]>.
+"""
+from __future__ import unicode_literals
+
+import operator
+import re
+import sys
+
+try:
+    from html.parser import HTMLParser
+except ImportError:
+    from HTMLParser import HTMLParser
+
+try:
+    import html
+except ImportError:
+    pass
+
+def regex_or(*items):
+    return '(?:' + '|'.join(items) + ')'
+
+Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
+Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
+
+punctChars = r"['\"“”‘’.?!…,:;]"
+#punctSeq = punctChars+"+" #'anthem'. => ' anthem '.
+punctSeq = r"['\"“”‘’]+|[.?!,…]+|[:;]+" #'anthem'. => ' anthem ' .
+entity = r"&(?:amp|lt|gt|quot);"
+# URLs
+
+
+# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
+# If you actually empirically test it the results are bad.
+# Please see https://github.com/brendano/ark-tweet-nlp/pull/9
+
+urlStart1 = r"(?:https?://|\bwww\.)"
+commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
+ccTLDs = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
+    r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
+    r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
+    r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
+    r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
+    r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
+    r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
+    r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
+urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
+urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
+urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
+urlEnd = r"(?:\.\.+|[<>]|\s|$)"
+url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
+
+
+# Numeric
+timeLike = r"\d+(?::\d+){1,2}"
+#numNum = r"\d+\.\d+"
+numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
+numComb = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?"
+
+# Abbreviations
+boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
+aa1 = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
+aa2 = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
+standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
+arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
+separators = "(?:--+|―|—|~|–|=)"
+decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)"
+thingsThatSplitWords = r"[^\s\.,?\"]"
+embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"
+
+# Emoticons
+# myleott: in Python the (?iu) flags affect the whole expression
+#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
+normalEyes = "[:=]" # 8 and x are eyes but cause problems
+wink = "[;]"
+noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
+happyMouths = r"[D\)\]\}]+"
+sadMouths = r"[\(\[\{]+"
+tongue = "[pPd3]+"
+otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned
+
+# mouth repetition examples:
+# @aliciakeys Put it in a love song :-))
+# @hellocalyclops =))=))=)) Oh well
+
+# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
+#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
+bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)"
+bfCenter = r"(?:[\.]|[_-]+)"
+bfRight = r"\2"
+s3 = r"(?:--['\"])"
+s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
+s5 = "(?:[.][_]+[.])"
+# myleott: in Python the (?i) flag affects the whole expression
+#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+
+eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
+eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+"
+eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
+eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
+
+oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
+
+
+emoticon = regex_or(
+    # Standard version :) :( :] :D :P
+    "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),
+
+    # reversed version (: D: use positive lookbehind to remove "(word):"
+    # because eyes on the right side is more ambiguous with the standard usage of : ;
+    regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",
+
+    #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
+    eastEmote.replace("2", "1", 1), basicface,
+    # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
+    # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
+
+    # myleott: o.O and O.o are two of the biggest sources of differences
+    # between this and the Java version. One little hack won't hurt...
+    oOEmote
+)
+
+Hearts = "(?:<+/?3+)+" #the other hearts are in decorations
+
+Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+")
+
+# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
+# "hello (#hashtag)" ==> "hello (#hashtag )" WRONG
+# "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT
+# "hello (@person)" ==> "hello (@person )" WRONG
+# "hello (@person)" ==> "hello ( @person )" RIGHT
+# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
+# has poor content-symbol detection.
+
+# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
+# If you want good hashtag identification, use a different regex.
+Hashtag = "#[a-zA-Z0-9_]+" #optional: lookbehind for \b
+#optional: lookbehind for \b, max length 15
+AtMention = "[@@][a-zA-Z0-9_]+"
+
+# I was worried this would conflict with at-mentions
+# but seems ok in sample of 5800: 7 changes all email fixes
+# http://www.regular-expressions.info/email.html
+Bound = r"(?:\W|^|$)"
+Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"
+
+# We will be tokenizing using these regexps as delimiters
+# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
+Protected = re.compile(
+    regex_or(
+        Hearts,
+        url,
+        Email,
+        timeLike,
+        #numNum,
+        numberWithCommas,
+        numComb,
+        emoticon,
+        Arrows,
+        entity,
+        punctSeq,
+        arbitraryAbbrev,
+        separators,
+        decorations,
+        embeddedApostrophe,
+        Hashtag,
+        AtMention), re.UNICODE)
+
+# Edge punctuation
+# Want: 'foo' => ' foo '
+# While also: don't => don't
+# the first is considered "edge punctuation".
+# the second is word-internal punctuation -- don't want to mess with it.
+# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
+# I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.
+
+# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
+#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
+edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
+edgePunct = "[" + edgePunctChars + "]"
+notEdgePunct = "[a-zA-Z0-9]" # content characters
+offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
+EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
+EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
+
+def splitEdgePunct(input):
+    input = EdgePunctLeft.sub(r"\1\2 \3", input)
+    input = EdgePunctRight.sub(r"\1 \2\3", input)
+    return input
+
+# The main work of tokenizing a tweet.
+def simpleTokenize(text):
+
+    # Do the no-brainers first
+    splitPunctText = splitEdgePunct(text)
+
+    textLength = len(splitPunctText)
+
+    # BTO: the logic here got quite convoluted via the Scala porting detour
+    # It would be good to switch back to a nice simple procedural style like in the Python version
+    # ... Scala is such a pain. Never again.
+
+    # Find the matches for subsequences that should be protected,
+    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
+    bads = []
+    badSpans = []
+    for match in Protected.finditer(splitPunctText):
+        # The spans of the "bads" should not be split.
+        if (match.start() != match.end()): #unnecessary?
+            bads.append( [splitPunctText[match.start():match.end()]] )
+            badSpans.append( (match.start(), match.end()) )
+
+    # Create a list of indices to create the "goods", which can be
+    # split. We are taking "bad" spans like
+    # List((2,5), (8,10))
+    # to create
+    # List(0, 2, 5, 8, 10, 12)
+    # where, e.g., "12" here would be the textLength
+    # has an even length and no indices are the same
+    indices = [0]
+    for (first, second) in badSpans:
+        indices.append(first)
+        indices.append(second)
+    indices.append(textLength)
+
+    # Group the indices and map them to their respective portion of the string
+    splitGoods = []
+    for i in range(0, len(indices), 2):
+        goodstr = splitPunctText[indices[i]:indices[i+1]]
+        splitstr = goodstr.strip().split(" ")
+        splitGoods.append(splitstr)
+
+    # Reinterpolate the 'good' and 'bad' Lists, ensuring that
+    # additonal tokens from last good item get included
+    zippedStr = []
+    for i in range(len(bads)):
+        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
+        zippedStr = addAllnonempty(zippedStr, bads[i])
+    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])
+
+    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
+    # Uncomment to get "you 're"
+    #splitStr = []
+    #for tok in zippedStr:
+    #    splitStr.extend(splitToken(tok))
+    #zippedStr = splitStr
+
+    return zippedStr
+
+def addAllnonempty(master, smaller):
+    for s in smaller:
+        strim = s.strip()
+        if (len(strim) > 0):
+            master.append(strim)
+    return master
+
+# "foo bar " => "foo bar"
+def squeezeWhitespace(input):
+    return Whitespace.sub(" ", input).strip()
+
+# Final pass tokenization based on special patterns
+def splitToken(token):
+    m = Contractions.search(token)
+    if m:
+        return [m.group(1), m.group(2)]
+    return [token]
+
+# Assume 'text' has no HTML escaping.
+def tokenize(text):
+    return simpleTokenize(squeezeWhitespace(text))
+
+
+# Twitter text comes HTML-escaped, so unescape it.
+# We also first unescape &amp;'s, in case the text has been buggily double-escaped.
+def normalizeTextForTagger(text):
+    assert sys.version_info[0] >= 3 and sys.version_info[1] > 3, 'Python version >3.3 required'
+    text = text.replace("&amp;", "&")
+    text = html.unescape(text)
+    return text
+
+# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
+#
+# This function normalizes the input text BEFORE calling the tokenizer.
+# So the tokens you get back may not exactly correspond to
+# substrings of the original text.
+def tokenizeRawTweetText(text):
+    tokens = tokenize(normalizeTextForTagger(text))
+    return tokens
+
+def tokenize_sentences(all_sentences):
+    sent_list = []
+    for sentence in all_sentences:
+        sent_list.append(' '.join(tokenizeRawTweetText(sentence)))
+    return sent_list
+
+def tokenize_sentence(sentence):
+    return ' '.join(tokenizeRawTweetText(sentence))
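
For context on how app.py uses this module: only the two helpers appended at the end, tokenize_sentence and tokenize_sentences, are imported there. A minimal usage sketch with arbitrary example strings:

from twokenize import tokenize_sentence, tokenize_sentences

# Returns the tweet-tokenized sentence re-joined with single spaces; URLs, emoticons,
# #hashtags and @mentions are protected and survive as single tokens.
print(tokenize_sentence("Check out http://example.com #nlp :)"))

# Applies the same tokenization to each sentence in a list.
print(tokenize_sentences(["hello (@someone)", "it's 12:30 already..."]))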