jgyasu committed (verified)
Commit 0840f0a · 1 Parent(s): 2471de4

Upload folder using huggingface_hub

Files changed (7)
  1. app.py +15 -26
  2. entailment.py +1 -1
  3. highlighter.py +15 -1
  4. lcs.py +3 -14
  5. non_melting_points.py +137 -0
  6. paraphraser.py +1 -1
  7. twokenize.py +317 -0
app.py CHANGED
@@ -5,7 +5,7 @@ import gradio as gr
 import time
 from tree import generate_subplot1, generate_subplot2
 from paraphraser import generate_paraphrase
-from lcs import find_common_subsequences, find_common_gram_positions
+# from lcs import find_common_subsequences, find_common_gram_positions
 from highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
 from entailment import analyze_entailment
 from masking_methods import mask_non_stopword, mask_non_stopword_pseudorandom, high_entropy_words
@@ -14,7 +14,9 @@ from detectability import SentenceDetectabilityCalculator
 from distortion import SentenceDistortionCalculator
 from euclidean_distance import SentenceEuclideanDistanceCalculator
 from threeD_plot import gen_three_D_plot
-from sankey import generate_sankey_diagram
+
+from twokenize import tokenize_sentences, tokenize_sentence
+from non_melting_points import find_non_melting_points
 
 class WatermarkingPipeline:
     def __init__(self):
@@ -39,7 +41,6 @@ class WatermarkingPipeline:
     def step1_paraphrasing(self, prompt, threshold=0.7):
         start_time = time.time()
 
-        # Existing step1 code...
         self.user_prompt = prompt
         self.paraphrased_sentences = generate_paraphrase(prompt)
         if self.paraphrased_sentences is None:
@@ -47,15 +48,17 @@
 
         self.analyzed_paraphrased_sentences, self.selected_sentences, self.discarded_sentences = \
             analyze_entailment(self.user_prompt, self.paraphrased_sentences, threshold)
-
-        self.common_grams = find_common_subsequences(self.user_prompt, self.selected_sentences)
-        self.subsequences = [subseq for _, subseq in self.common_grams]
-        self.common_grams_position = find_common_gram_positions(self.selected_sentences, self.subsequences)
-
-        colors = ["red", "blue", "brown", "green"]
-        def select_color():
-            return random.choice(colors)
-        highlight_info = [(word, select_color()) for _, word in self.common_grams]
+
+        self.user_prompt_tokenized = tokenize_sentence(self.user_prompt)
+        self.selected_sentences_tokenized = tokenize_sentences(self.selected_sentences)
+        self.discarded_sentences_tokenized = tokenize_sentences(self.discarded_sentences)
+
+        all_tokenized_sentences = []
+        all_tokenized_sentences.append(self.user_prompt_tokenized)
+        all_tokenized_sentences.extend(self.selected_sentences_tokenized)
+        all_tokenized_sentences.extend(self.discarded_sentences_tokenized)
+
+        self.common_grams = find_non_melting_points(all_tokenized_sentences)
 
         highlighted_user_prompt = highlight_common_words(
             self.common_grams, [self.user_prompt], "Highlighted LCS in the User Prompt"
@@ -227,9 +230,6 @@
 
         return three_D_plot, time_info
 
-    def step6_sankey(self):
-        return generate_sankey_diagram()
-
 def create_gradio_interface():
     pipeline = WatermarkingPipeline()
 
@@ -289,11 +289,6 @@
            gr.Markdown("### 3D Visualization of Metrics")
            three_D_plot = gr.Plot()
            step5_time = gr.Textbox(label="Execution Time", interactive=False)
-
-        # Sankey Diagram
-        gr.Markdown("# Watermarking Pipeline Flow Visualization")
-        generate_button = gr.Button("Generate Sankey Diagram")
-        sankey_plot = gr.Plot()
 
         paraphrase_button.click(
             pipeline.step1_paraphrasing,
@@ -325,12 +320,6 @@
             inputs=None,
             outputs=[three_D_plot, step5_time]
         )
-
-        generate_button.click(
-            pipeline.step6_sankey,
-            inputs=None,
-            outputs=sankey_plot
-        )
 
     return demo
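
Taken together, the app.py hunks replace the LCS-based common_grams computation (and the Sankey step) with twokenize plus non_melting_points. Below is a minimal standalone sketch of the new step 1 data flow, using only modules already present in this repo; the prompt string and the result shown in the comment are purely illustrative, not part of the commit.

# Sketch of the new step 1 data flow (illustrative)
from paraphraser import generate_paraphrase
from entailment import analyze_entailment
from twokenize import tokenize_sentence, tokenize_sentences
from non_melting_points import find_non_melting_points

prompt = "The quick brown fox jumps over the lazy dog."  # arbitrary example prompt
paraphrases = generate_paraphrase(prompt)
_, selected, discarded = analyze_entailment(prompt, paraphrases, 0.7)

# Pool the tokenized prompt with the tokenized selected and discarded paraphrases
all_tokenized = [tokenize_sentence(prompt)]
all_tokenized.extend(tokenize_sentences(selected))
all_tokenized.extend(tokenize_sentences(discarded))

common_grams = find_non_melting_points(all_tokenized)
print(common_grams)  # list of (rank, n-gram) pairs, e.g. [(1, "quick brown fox"), (2, "lazy dog")]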
 
entailment.py CHANGED
@@ -28,4 +28,4 @@ def analyze_entailment(original_sentence, paraphrased_sentences, threshold):
 
     return all_sentences, selected_sentences, discarded_sentences
 
-# print(analyze_entailment("I love you", [""], 0.7))
+# print(analyze_entailment("I love you", ["I like you", "I hate you"], 0.7))
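
A minimal usage sketch of the updated example (the sentences are illustrative; the function returns the analyzed sentences plus the selected/discarded split at the given threshold):

from entailment import analyze_entailment

all_sentences, selected, discarded = analyze_entailment(
    "I love you", ["I like you", "I hate you"], 0.7
)
print(selected)   # paraphrases kept by the 0.7 threshold
print(discarded)  # paraphrases filtered out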
highlighter.py CHANGED
@@ -85,6 +85,7 @@ def highlight_common_words_dict(common_words, sentences, title):
     </div>
     '''
 
+
 def reparaphrased_sentences_html(sentences):
 
     formatted_sentences = []
@@ -101,4 +102,17 @@ def reparaphrased_sentences_html(sentences):
     box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
     <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
     </div>
-    '''
+    '''
+
+
+common_words = [(1, "highlight"), (2, "numbering")]
+sentences = ["This is a test to highlight words.", "Numbering is important for clarity."]
+
+# Test highlight_common_words
+highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting")
+print(highlighted_html)
+
+# Test highlight_common_words_dict
+sentences_with_scores = {"Highlight words in this text.": 0.95, "Number sentences for clarity.": 0.8}
+highlighted_html_dict = highlight_common_words_dict(common_words, sentences_with_scores, "Test Dict Highlighting")
+print(highlighted_html_dict)
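
Note that the test calls added at the bottom of highlighter.py sit at module level, so they also execute whenever the module is imported (app.py imports it). A minimal sketch of the conventional guard, reusing the commit's own test data, in case that is not intended:

if __name__ == "__main__":
    common_words = [(1, "highlight"), (2, "numbering")]
    sentences = ["This is a test to highlight words.", "Numbering is important for clarity."]
    print(highlight_common_words(common_words, sentences, "Test Highlighting"))

    sentences_with_scores = {"Highlight words in this text.": 0.95, "Number sentences for clarity.": 0.8}
    print(highlight_common_words_dict(common_words, sentences_with_scores, "Test Dict Highlighting"))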
lcs.py CHANGED
@@ -71,20 +71,9 @@ def find_common_gram_positions(str_list, common_grams):
     return positions
 
 
-# # Example usage
-# sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."
-# str_list = [
-# 'During a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump stated that being Jewish in the United States has never been more hazardous since the Holocaust.',
-# 'At a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump declared that being Jewish in the United States has never been more hazardous since the Holocaust.',
-# 'Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania, and stated that being Jewish in the United States has never been more perilous since the Holocaust.',
-# 'Donald Trump made the statement at a campaign rally in Wilkes-Barre, Pennsylvania, saying that being Jewish in the United States has never been more dangerous since the Holocaust.',
-# 'Last month, Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania and stated that being Jewish in the United States has never been more hazardous than during World War II.',
-# 'In Wilkes-Barre, Pennsylvania, Donald Trump spoke at a campaign rally and claimed that the Holocaust was always more hazardous for Jews in the United States.',
-# 'A campaign rally in Wilkes-Barre, Pennsylvania saw Donald Trump declare that being Jewish in the United States has never been more perilous since WWII.',
-# 'Speaking at a campaign rally in Wilkes-Barre, Pennsylvania today, Donald Trump declared that being Jewish has never been more hazardous in the United States since the Holocaust.',
-# 'During his campaign rally in Wilkes-Barre, Pennsylvania today Donald Trump stated: "There has never been a safer place for being Jewish in the United States since the Holocaust."',
-# 'At a campaign rally in Wilkes-Barre, Pennsylvania (pictured), Donald Trump said, "There has never been... gotten worse for being Jewish in America since the Holocaust."'
-# ]
+# Example usage
+# sentence = "I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."
+# str_list = ['The possibility of a robot being the next "great writer" is underwhelming, as I doubt they will be able to convey the same level of intelligence and complexity as humans.', 'I doubt that the next "great writer" will be a robot or that they\'ll be capable of conveying the same level of intelligence and complexity as humans.', 'It\'s my hunches that the next "great writer" will be a robot, or that they\'ll be much more capable of conveying the subtle and profound aspects of human thinking than humans themselves.', 'I have little faith that the next "great writer" will be a robot or that they can accurately convey the subtle and profound aspects of human thinking.', 'My suspicion is that the next "great writer" will be a robot or that they\'ll possess greater intelligence and complexity than if not already present in human form.', 'The idea that a robot will be the next "great writer" is beyond doubt, as I\'m not convinced they can convey the same level of complexity and sophistication as humans.', 'But I\'m very suspicious of the future -- we might hope that someday a robot will be the next "great writer" or at least they\'ll be able to convey the depth and complexity of what humans think than any human.', 'There is a growing doubt in my mind that the next "great writer" will be dominated by gizmos or even capable of outlining every detail and depth of human thought.', 'It seems unlikely that a robot will be the next great writer or that they can convey the subtle and profound aspects of human thinking in evocative terms.', 'Whether or not the next "great writer" is an unknown, and I\'m skeptical about whether they can ever truly embody human thought.']
 
 # # Find common subsequences
 # common_grams = find_common_subsequences(sentence, str_list)

non_melting_points.py ADDED
@@ -0,0 +1,137 @@
+import nltk
+from nltk.corpus import stopwords
+
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords')
+
+def remove_stopwords(text):
+    """
+    Remove stopwords using NLTK's stopword list
+
+    Args:
+        text (str): Input text
+
+    Returns:
+        str: Cleaned text with stopwords removed
+    """
+    stop_words = set(stopwords.words('english'))
+    words = text.lower().split()
+    return ' '.join([word for word in words if word not in stop_words])
+
+def is_exact_match(ngram, sentences):
+    """
+    Check if the given n-gram has an exact match in all sentences
+
+    Args:
+        ngram (str): The n-gram to search for
+        sentences (list): List of sentences to search in
+
+    Returns:
+        bool: True if n-gram has exact match in all sentences, False otherwise
+    """
+    sentence_ngrams = []
+    for sentence in sentences:
+        words = sentence.split()
+        current_ngrams = []
+        n = len(ngram.split())
+
+        for i in range(len(words) - n + 1):
+            current_ngram = " ".join(words[i:i+n])
+            current_ngrams.append(current_ngram)
+
+        sentence_ngrams.append(set(current_ngrams))
+
+    return all(ngram in sent_ngrams for sent_ngrams in sentence_ngrams)
+
+def is_substring_of_any(ngram, common_ngrams):
+    """
+    Check if the given n-gram is an exact substring of any previously found common n-grams
+
+    Args:
+        ngram (str): The n-gram to check
+        common_ngrams (list): List of previously found common n-grams
+
+    Returns:
+        bool: True if ngram is a substring of any common_ngrams, False otherwise
+    """
+    ngram_words = ngram.split()
+    for common_gram in common_ngrams:
+        common_words = common_gram.split()
+        for i in range(len(common_words) - len(ngram_words) + 1):
+            if " ".join(common_words[i:i+len(ngram_words)]) == ngram:
+                return True
+    return False
+
+def find_filtered_ngrams(sentences):
+    """
+    Find all n-grams that have exact matches across all sentences,
+    excluding those that are part of larger common n-grams
+
+    Args:
+        sentences (list): List of sentences to analyze
+
+    Returns:
+        list: List of all common n-grams in order of their appearance in the first sentence
+    """
+    # First, remove stopwords from all sentences
+    cleaned_sentences = [remove_stopwords(sentence) for sentence in sentences]
+
+    words = cleaned_sentences[0].split()
+    max_n = len(words)
+    all_common_ngrams = []
+
+    for n in range(max_n, 0, -1):
+        for i in range(len(words) - n + 1):
+            ngram = " ".join(words[i:i+n])
+
+            if is_exact_match(ngram, cleaned_sentences) and not is_substring_of_any(ngram, all_common_ngrams):
+                all_common_ngrams.append(ngram)
+
+    return all_common_ngrams
+
+def find_relative_order(sentence, common_ngrams):
+    sentence = sentence.lower()
+    ngram_positions = {}
+
+    for ngram in common_ngrams:
+        ngram_lower = ngram.lower()
+        if ngram_lower in sentence:
+            position = sentence.index(ngram_lower)
+            ngram_positions[ngram] = position
+
+    sorted_ngrams = sorted(ngram_positions.items(), key=lambda x: x[1])
+
+    result = [(i + 1, ngram) for i, (ngram, _) in enumerate(sorted_ngrams)]
+
+    return result
+
+
+def find_non_melting_points(sent_list):
+
+    # Find filtered n-grams
+    common_ngrams = find_filtered_ngrams(sent_list)
+
+    def remove_punctuation(common_ngrams):
+        punctuation = ".?!.;,:'\"()[]{}-–—...+/\\*^|@#%&_~`"
+        for item in common_ngrams:
+            if item in punctuation:
+                common_ngrams.remove(item)
+        return common_ngrams
+
+    final_list = remove_punctuation(common_ngrams)
+    sentence = sent_list[0]
+    non_melting_points = find_relative_order(sentence, final_list)
+
+    return non_melting_points
+
+
+# Example usage
+# from paraphraser import generate_paraphrase
+# from twokenize import tokenize_sentences
+
+# sentences = tokenize_sentences(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
+# non_melting_points = find_non_melting_points(sentences)
+
+# print(non_melting_points)
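
The commented example at the end of the file depends on the paraphraser; here is a smaller self-contained sketch of what find_non_melting_points returns, using made-up sentences (assumes NLTK's English stopword list is available):

from non_melting_points import find_non_melting_points

sentences = [
    "the weather in berlin was sunny all week",
    "berlin enjoyed sunny weather for the whole week",
    "sunny weather stayed over berlin the entire week",
]
# Each kept n-gram must appear verbatim in every sentence after stopword removal;
# the result is ordered by position in the first sentence and numbered from 1.
print(find_non_melting_points(sentences))
# [(1, 'weather'), (2, 'berlin'), (3, 'sunny'), (4, 'week')]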
paraphraser.py CHANGED
@@ -28,7 +28,7 @@ def generate_paraphrase(question):
     res = paraphrase(question, para_tokenizer, para_model)
     return res
 
-# print(generate_paraphrase("Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time 5since the Holocaust” to be Jewish in the United States."))
+# print(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
 
 '''
 Accepts a sentence or list of sentences and returns a lit of all their paraphrases using GPT-4.

twokenize.py ADDED
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
+This tokenizer code has gone through a long history:
+
+(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
+TweetMotif: Exploratory Search and Topic Summarization for Twitter.
+Brendan O'Connor, Michel Krieger, and David Ahn.
+ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
+(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
+(2b) Jason Baldridge and David Snyder ported it to Scala
+(3) Brendan bugfixed the Scala port and merged with POS-specific changes
+for the CMU ARK Twitter POS Tagger
+(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)
+
+Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP
+
+There have been at least 2 other Java ports, but they are not in the lineage for the code here.
+
+Ported to Python by Myle Ott <[email protected]>.
+"""
+from __future__ import unicode_literals
+
+import operator
+import re
+import sys
+
+try:
+    from html.parser import HTMLParser
+except ImportError:
+    from HTMLParser import HTMLParser
+
+try:
+    import html
+except ImportError:
+    pass
+
+def regex_or(*items):
+    return '(?:' + '|'.join(items) + ')'
+
+Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
+Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
+
+punctChars = r"['\"“”‘’.?!…,:;]"
+#punctSeq = punctChars+"+" #'anthem'. => ' anthem '.
+punctSeq = r"['\"“”‘’]+|[.?!,…]+|[:;]+" #'anthem'. => ' anthem ' .
+entity = r"&(?:amp|lt|gt|quot);"
+# URLs
+
+
+# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
+# If you actually empirically test it the results are bad.
+# Please see https://github.com/brendano/ark-tweet-nlp/pull/9
+
+urlStart1 = r"(?:https?://|\bwww\.)"
+commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
+ccTLDs = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
+    r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
+    r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
+    r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
+    r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
+    r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
+    r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
+    r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
+urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
+urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
+urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
+urlEnd = r"(?:\.\.+|[<>]|\s|$)"
+url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
+
+
+# Numeric
+timeLike = r"\d+(?::\d+){1,2}"
+#numNum = r"\d+\.\d+"
+numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
+numComb = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?"
+
+# Abbreviations
+boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
+aa1 = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
+aa2 = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
+standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
+arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
+separators = "(?:--+|―|—|~|–|=)"
+decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)"
+thingsThatSplitWords = r"[^\s\.,?\"]"
+embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"
+
+# Emoticons
+# myleott: in Python the (?iu) flags affect the whole expression
+#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
+normalEyes = "[:=]" # 8 and x are eyes but cause problems
+wink = "[;]"
+noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
+happyMouths = r"[D\)\]\}]+"
+sadMouths = r"[\(\[\{]+"
+tongue = "[pPd3]+"
+otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned
+
+# mouth repetition examples:
+# @aliciakeys Put it in a love song :-))
+# @hellocalyclops =))=))=)) Oh well
+
+# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
+#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
+bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)"
+bfCenter = r"(?:[\.]|[_-]+)"
+bfRight = r"\2"
+s3 = r"(?:--['\"])"
+s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
+s5 = "(?:[.][_]+[.])"
+# myleott: in Python the (?i) flag affects the whole expression
+#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+
+eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
+eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+"
+eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
+eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
+
+oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
+
+
+emoticon = regex_or(
+    # Standard version :) :( :] :D :P
+    "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),
+
+    # reversed version (: D: use positive lookbehind to remove "(word):"
+    # because eyes on the right side is more ambiguous with the standard usage of : ;
+    regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",
+
+    #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
+    eastEmote.replace("2", "1", 1), basicface,
+    # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
+    # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
+
+    # myleott: o.O and O.o are two of the biggest sources of differences
+    # between this and the Java version. One little hack won't hurt...
+    oOEmote
+)
+
+Hearts = "(?:<+/?3+)+" #the other hearts are in decorations
+
+Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+")
+
+# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
+# "hello (#hashtag)" ==> "hello (#hashtag )" WRONG
+# "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT
+# "hello (@person)" ==> "hello (@person )" WRONG
+# "hello (@person)" ==> "hello ( @person )" RIGHT
+# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
+# has poor content-symbol detection.
+
+# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
+# If you want good hashtag identification, use a different regex.
+Hashtag = "#[a-zA-Z0-9_]+" #optional: lookbehind for \b
+#optional: lookbehind for \b, max length 15
+AtMention = "[@@][a-zA-Z0-9_]+"
+
+# I was worried this would conflict with at-mentions
+# but seems ok in sample of 5800: 7 changes all email fixes
+# http://www.regular-expressions.info/email.html
+Bound = r"(?:\W|^|$)"
+Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"
+
+# We will be tokenizing using these regexps as delimiters
+# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
+Protected = re.compile(
+    regex_or(
+        Hearts,
+        url,
+        Email,
+        timeLike,
+        #numNum,
+        numberWithCommas,
+        numComb,
+        emoticon,
+        Arrows,
+        entity,
+        punctSeq,
+        arbitraryAbbrev,
+        separators,
+        decorations,
+        embeddedApostrophe,
+        Hashtag,
+        AtMention), re.UNICODE)
+
+# Edge punctuation
+# Want: 'foo' => ' foo '
+# While also: don't => don't
+# the first is considered "edge punctuation".
+# the second is word-internal punctuation -- don't want to mess with it.
+# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
+# I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.
+
+# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
+#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
+edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
+edgePunct = "[" + edgePunctChars + "]"
+notEdgePunct = "[a-zA-Z0-9]" # content characters
+offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
+EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
+EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
+
+def splitEdgePunct(input):
+    input = EdgePunctLeft.sub(r"\1\2 \3", input)
+    input = EdgePunctRight.sub(r"\1 \2\3", input)
+    return input
+
+# The main work of tokenizing a tweet.
+def simpleTokenize(text):
+
+    # Do the no-brainers first
+    splitPunctText = splitEdgePunct(text)
+
+    textLength = len(splitPunctText)
+
+    # BTO: the logic here got quite convoluted via the Scala porting detour
+    # It would be good to switch back to a nice simple procedural style like in the Python version
+    # ... Scala is such a pain. Never again.
+
+    # Find the matches for subsequences that should be protected,
+    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
+    bads = []
+    badSpans = []
+    for match in Protected.finditer(splitPunctText):
+        # The spans of the "bads" should not be split.
+        if (match.start() != match.end()): #unnecessary?
+            bads.append( [splitPunctText[match.start():match.end()]] )
+            badSpans.append( (match.start(), match.end()) )
+
+    # Create a list of indices to create the "goods", which can be
+    # split. We are taking "bad" spans like
+    # List((2,5), (8,10))
+    # to create
+    # List(0, 2, 5, 8, 10, 12)
+    # where, e.g., "12" here would be the textLength
+    # has an even length and no indices are the same
+    indices = [0]
+    for (first, second) in badSpans:
+        indices.append(first)
+        indices.append(second)
+    indices.append(textLength)
+
+    # Group the indices and map them to their respective portion of the string
+    splitGoods = []
+    for i in range(0, len(indices), 2):
+        goodstr = splitPunctText[indices[i]:indices[i+1]]
+        splitstr = goodstr.strip().split(" ")
+        splitGoods.append(splitstr)
+
+    # Reinterpolate the 'good' and 'bad' Lists, ensuring that
+    # additonal tokens from last good item get included
+    zippedStr = []
+    for i in range(len(bads)):
+        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
+        zippedStr = addAllnonempty(zippedStr, bads[i])
+    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])
+
+    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
+    # Uncomment to get "you 're"
+    #splitStr = []
+    #for tok in zippedStr:
+    #    splitStr.extend(splitToken(tok))
+    #zippedStr = splitStr
+
+    return zippedStr
+
+def addAllnonempty(master, smaller):
+    for s in smaller:
+        strim = s.strip()
+        if (len(strim) > 0):
+            master.append(strim)
+    return master
+
+# "foo bar " => "foo bar"
+def squeezeWhitespace(input):
+    return Whitespace.sub(" ", input).strip()
+
+# Final pass tokenization based on special patterns
+def splitToken(token):
+    m = Contractions.search(token)
+    if m:
+        return [m.group(1), m.group(2)]
+    return [token]
+
+# Assume 'text' has no HTML escaping.
+def tokenize(text):
+    return simpleTokenize(squeezeWhitespace(text))
+
+
+# Twitter text comes HTML-escaped, so unescape it.
+# We also first unescape &amp;'s, in case the text has been buggily double-escaped.
+def normalizeTextForTagger(text):
+    assert sys.version_info[0] >= 3 and sys.version_info[1] > 3, 'Python version >3.3 required'
+    text = text.replace("&amp;", "&")
+    text = html.unescape(text)
+    return text
+
+# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
+#
+# This function normalizes the input text BEFORE calling the tokenizer.
+# So the tokens you get back may not exactly correspond to
+# substrings of the original text.
+def tokenizeRawTweetText(text):
+    tokens = tokenize(normalizeTextForTagger(text))
+    return tokens
+
+def tokenize_sentences(all_sentences):
+    sent_list = []
+    for sentence in all_sentences:
+        sent_list.append(' '.join(tokenizeRawTweetText(sentence)))
+    return sent_list
+
+def tokenize_sentence(sentence):
+    return ' '.join(tokenizeRawTweetText(sentence))
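
For context on how app.py uses this module: only the two helpers appended at the end, tokenize_sentence and tokenize_sentences, are imported there. A minimal usage sketch with arbitrary example strings:

from twokenize import tokenize_sentence, tokenize_sentences

# Returns the tweet-tokenized sentence re-joined with single spaces; URLs, emoticons,
# #hashtags and @mentions are protected and survive as single tokens.
print(tokenize_sentence("Check out http://example.com #nlp :)"))

# Applies the same tokenization to each sentence in a list.
print(tokenize_sentences(["hello (@someone)", "it's 12:30 already..."]))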