SAMI committed
Commit • 14b1d78
Parent(s): afb491e
commit from sami

Files changed:
- checkpoint10.pt +3 -0
- indic_tokenize.py.py +111 -0
checkpoint10.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:326ea3f506d52cbd6e75fd9951acde3347e8ad681f994cef3ea763964374a9fd
+size 1052083291
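The checkpoint itself is stored through Git LFS, so the commit only adds this three-line pointer (spec version, SHA-256 object id, and a size of about 1.05 GB); the actual weights live in LFS storage. A minimal inspection sketch, assuming the real file has been fetched with `git lfs pull` and is an ordinary PyTorch checkpoint (the .pt extension suggests this, but the commit does not confirm it):

import torch

# Hypothetical inspection of checkpoint10.pt; assumes Git LFS has already
# materialized the real file and that it is a standard PyTorch checkpoint.
state = torch.load('checkpoint10.pt', map_location='cpu')
print(type(state))
if isinstance(state, dict):
    # Checkpoints are commonly dicts: model state_dict, optimizer state, epoch, ...
    print(list(state.keys())[:10])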
indic_tokenize.py.py ADDED
@@ -0,0 +1,111 @@
+#
+# Copyright (c) 2013-present, Anoop Kunchukuttan
+# All rights reserved.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+# Program for tokenizing Indian language input
+#
+# @author Anoop Kunchukuttan
+#
+"""
+Tokenizer for Indian languages. Currently, simple punctuation-based tokenizers
+are supported (see `trivial_tokenize`). Major Indian language punctuations are
+handled.
+"""
+import string, re, sys
+
+from indicnlp.common import IndicNlpException
+
+### tokenizer patterns
+triv_tokenizer_indic_pat=re.compile(r'(['+string.punctuation+r'\u0964\u0965'+r'])')
+triv_tokenizer_urdu_pat=re.compile(r'(['+string.punctuation+r'\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4'+r'])')
+
+## date, numbers, section/article numbering
+pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')
+
+def trivial_tokenize_indic(text):
+    """tokenize string for Indian language scripts using Brahmi-derived scripts
+
+    A trivial tokenizer which just tokenizes on the punctuation boundaries.
+    This also includes punctuations for the Indian language scripts (the
+    purna virama and the deergha virama). This is a language-independent
+    tokenizer.
+
+    Args:
+        text (str): text to tokenize
+
+    Returns:
+        list: list of tokens
+
+    """
+    tok_str=triv_tokenizer_indic_pat.sub(r' \1 ',text.replace('\t',' '))
+    # return re.sub(r'[ ]+',' ',tok_str).strip(' ').split(' ')
+
+    s=re.sub(r'[ ]+',' ',tok_str).strip(' ')
+
+    # do not tokenize numbers and dates
+    new_s=''
+    prev=0
+    for m in pat_num_seq.finditer(s):
+        start=m.start()
+        end=m.end()
+        if start>prev:
+            new_s=new_s+s[prev:start]
+        new_s=new_s+s[start:end].replace(' ','')
+        prev=end
+
+    new_s=new_s+s[prev:]
+    s=new_s
+
+    return s.split(' ')
+
+def trivial_tokenize_urdu(text):
+    """tokenize Urdu string
+
+    A trivial tokenizer which just tokenizes on the punctuation boundaries.
+    This also includes punctuations for the Urdu script.
+    These punctuation characters were identified from the Unicode database
+    for the Arabic script by looking for punctuation symbols.
+
+    Args:
+        text (str): text to tokenize
+
+    Returns:
+        list: list of tokens
+    """
+    tok_str=triv_tokenizer_urdu_pat.sub(r' \1 ',text.replace('\t',' '))
+    return re.sub(r'[ ]+',' ',tok_str).strip(' ').split(' ')
+
+def trivial_tokenize(text,lang='hi'):
+    """trivial tokenizer for Indian languages using Brahmi or Arabic scripts
+
+    A trivial tokenizer which just tokenizes on the punctuation boundaries.
+    Major punctuations specific to Indian languages are handled.
+    These punctuation characters were identified from the Unicode database.
+
+    Args:
+        text (str): text to tokenize
+        lang (str): ISO 639-1 language code (e.g. 'hi', 'ur')
+
+    Returns:
+        list: list of tokens
+    """
+    if lang=='ur':
+        return trivial_tokenize_urdu(text)
+    else:
+        return trivial_tokenize_indic(text)
+
+# if __name__ == '__main__':
+
+#     if len(sys.argv)<4:
+#         print("Usage: python indic_tokenize.py <infile> <outfile> <language>")
+#         sys.exit(1)
+
+#     with open(sys.argv[1],'r', encoding='utf-8') as ifile:
+#         with open(sys.argv[2],'w', encoding='utf-8') as ofile:
+#             for line in ifile:
+#                 tokenized_line=' '.join(trivial_tokenize(line,sys.argv[3]))
+#                 ofile.write(tokenized_line)
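For reference, a minimal usage sketch of the new tokenizer. It assumes the committed file is renamed from indic_tokenize.py.py (the doubled extension looks accidental) to indic_tokenize.py so it is importable, and that the indicnlp package it imports from is installed:

from indic_tokenize import trivial_tokenize

# Devanagari: the purna virama (danda, U+0964) is split off as its own token.
print(trivial_tokenize('यह एक वाक्य है।', lang='hi'))
# ['यह', 'एक', 'वाक्य', 'है', '।']

# Dates and decimals survive as single tokens thanks to the number re-merge pass.
print(trivial_tokenize('आज 12.5.2013 है।', lang='hi'))
# ['आज', '12.5.2013', 'है', '।']

# lang='ur' dispatches to the Urdu tokenizer, which handles Arabic-script
# punctuation such as the Arabic full stop (U+06D4).
print(trivial_tokenize('یہ ایک جملہ ہے۔', lang='ur'))
# ['یہ', 'ایک', 'جملہ', 'ہے', '۔']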
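The least obvious step in trivial_tokenize_indic is the pat_num_seq pass: punctuation splitting first shatters digit runs, and the finditer loop then re-joins any "digits [,.:/] digits" sequence so dates, decimals, and section numbers come out whole. A small illustration of just that step (using a re.sub with a callback that is equivalent to, but not identical with, the file's loop):

import re, string

triv_tokenizer_indic_pat = re.compile(r'(['+string.punctuation+r'\u0964\u0965'+r'])')
pat_num_seq = re.compile(r'([0-9]+ [,.:/] )+[0-9]+')

s = triv_tokenizer_indic_pat.sub(r' \1 ', 'Section 2.3.1')
s = re.sub(r'[ ]+', ' ', s).strip(' ')
print(s)  # 'Section 2 . 3 . 1'  <- the number is shattered by punctuation splitting
print(pat_num_seq.sub(lambda m: m.group().replace(' ', ''), s))
# 'Section 2.3.1'               <- the re-merge pass restores it as one token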