SAMI committed on
Commit
14b1d78
1 Parent(s): afb491e

commit from sami

Files changed (2)
  1. checkpoint10.pt +3 -0
  2. indic_tokenize.py.py +111 -0
checkpoint10.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:326ea3f506d52cbd6e75fd9951acde3347e8ad681f994cef3ea763964374a9fd
+ size 1052083291
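
The checkpoint is committed as a Git LFS pointer rather than the raw weights: the three lines above record the pointer spec version, the SHA-256 object ID of the real file, and its size in bytes (1052083291 bytes, about 1.05 GB). As a minimal sketch of that key/value layout, here is a hypothetical parser for such a pointer file (`parse_lfs_pointer` is an illustrative name, not part of Git LFS or this repo):

```python
# Minimal sketch: parse the key/value lines of a Git LFS pointer file.
# parse_lfs_pointer is a hypothetical helper, not Git LFS tooling.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")  # each line is "<key> <value>"
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:326ea3f506d52cbd6e75fd9951acde3347e8ad681f994cef3ea763964374a9fd
size 1052083291"""

fields = parse_lfs_pointer(pointer)
print(fields["oid"])        # sha256:326ea3f5...
print(int(fields["size"]))  # 1052083291 bytes (~1.05 GB)
```

Git LFS itself resolves the `oid` against the LFS object store on checkout; the sketch only illustrates how the pointer's fields are laid out.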
indic_tokenize.py.py ADDED
@@ -0,0 +1,111 @@
+ #
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
+ # All rights reserved.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+ #
+
+ # Program for tokenizing Indian language input
+ #
+ # @author Anoop Kunchukuttan
+ #
+ """
+ Tokenizer for Indian languages. Currently, simple punctuation-based tokenizers
+ are supported (see `trivial_tokenize`). Major Indian language punctuation marks
+ are handled.
+ """
+ import string, re, sys
+
+ from indicnlp.common import IndicNlpException
+
+ ### tokenizer patterns
+ triv_tokenizer_indic_pat=re.compile(r'(['+string.punctuation+r'\u0964\u0965'+r'])')
+ triv_tokenizer_urdu_pat=re.compile(r'(['+string.punctuation+r'\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4'+r'])')
+
+ ## date, numbers, section/article numbering
+ pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')
+
+ def trivial_tokenize_indic(text):
+     """tokenize string for Indian language scripts using Brahmi-derived scripts
+
+     A trivial tokenizer which just tokenizes on punctuation boundaries.
+     This also includes punctuation marks for the Indian language scripts (the
+     purna virama and the deergha virama). This is a language-independent
+     tokenizer.
+
+     Args:
+         text (str): text to tokenize
+
+     Returns:
+         list: list of tokens
+
+     """
+     # pad every punctuation character with spaces, then collapse space runs
+     tok_str=triv_tokenizer_indic_pat.sub(r' \1 ',text.replace('\t',' '))
+     # return re.sub(r'[ ]+',' ',tok_str).strip(' ').split(' ')
+
+     s=re.sub(r'[ ]+',' ',tok_str).strip(' ')
+
+     # do not tokenize numbers and dates: re-join digit sequences such as
+     # "1 / 2 / 2020" that the punctuation padding above split apart
+     new_s=''
+     prev=0
+     for m in pat_num_seq.finditer(s):
+         start=m.start()
+         end=m.end()
+         if start>prev:
+             new_s=new_s+s[prev:start]
+         new_s=new_s+s[start:end].replace(' ','')
+         prev=end
+
+     new_s=new_s+s[prev:]
+     s=new_s
+
+     return s.split(' ')
+
+ def trivial_tokenize_urdu(text):
+     """tokenize Urdu string
+
+     A trivial tokenizer which just tokenizes on punctuation boundaries.
+     This also includes punctuation marks for the Urdu script. These
+     punctuation characters were identified from the Unicode database for the
+     Arabic script by looking for punctuation symbols.
+
+     Args:
+         text (str): text to tokenize
+
+     Returns:
+         list: list of tokens
+     """
+     tok_str=triv_tokenizer_urdu_pat.sub(r' \1 ',text.replace('\t',' '))
+     return re.sub(r'[ ]+',' ',tok_str).strip(' ').split(' ')
+
+ def trivial_tokenize(text,lang='hi'):
+     """trivial tokenizer for Indian languages using Brahmi or Arabic scripts
+
+     A trivial tokenizer which just tokenizes on punctuation boundaries.
+     Major punctuation marks specific to Indian languages are handled. These
+     punctuation characters were identified from the Unicode database.
+
+     Args:
+         text (str): text to tokenize
+         lang (str): ISO 639-2 language code
+
+     Returns:
+         list: list of tokens
+     """
+     if lang=='ur':
+         return trivial_tokenize_urdu(text)
+     else:
+         return trivial_tokenize_indic(text)
+
+ # if __name__ == '__main__':
+
+ #     if len(sys.argv)<4:
+ #         print("Usage: python indic_tokenize.py <infile> <outfile> <language>")
+ #         sys.exit(1)
+
+ #     with open(sys.argv[1],'r', encoding='utf-8') as ifile:
+ #         with open(sys.argv[2],'w', encoding='utf-8') as ofile:
+ #             for line in ifile:
+ #                 tokenized_line=' '.join(trivial_tokenize(line,sys.argv[3]))
+ #                 ofile.write(tokenized_line)
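
For reference, a usage sketch of the tokenizer added above. It assumes the module is saved as `indic_tokenize.py` (the committed filename carries a doubled `.py` extension) and that the `indicnlp` package is installed so the `IndicNlpException` import resolves; the sample sentences and printed outputs are illustrative:

```python
# Illustrative usage of the tokenizers added in this commit.
# Assumes the file is importable as indic_tokenize and indicnlp is installed.
from indic_tokenize import trivial_tokenize

# Devanagari text: the purna virama (U+0964) is split off as its own token.
print(trivial_tokenize('यह एक वाक्य है।', lang='hi'))
# -> ['यह', 'एक', 'वाक्य', 'है', '।']

# Digit sequences around , . : / are re-joined, so dates survive intact.
print(trivial_tokenize('बैठक 12/03/2021 को होगी।', lang='hi'))
# -> ['बैठक', '12/03/2021', 'को', 'होगी', '।']
```

The second call shows the `pat_num_seq` pass at work: the punctuation padding first splits the date into `12 / 03 / 2021`, and the number-sequence pass then strips the inserted spaces so dates and section numbers come out as single tokens.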