SAMI committed
Commit • 14b1d78
Parent(s): afb491e
commit from sami

Files changed:
- checkpoint10.pt +3 -0
- indic_tokenize.py.py +111 -0
checkpoint10.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:326ea3f506d52cbd6e75fd9951acde3347e8ad681f994cef3ea763964374a9fd
+size 1052083291
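The checkpoint itself is stored through Git LFS, so the commit only adds this three-line pointer (spec version, SHA-256 object id, and a size of about 1.05 GB); the actual weights live in LFS storage. A minimal inspection sketch, assuming the real file has been fetched with `git lfs pull` and is an ordinary PyTorch checkpoint (the .pt extension suggests this, but the commit does not confirm it):

import torch

# Hypothetical inspection of checkpoint10.pt; assumes Git LFS has already
# materialized the real file and that it is a standard PyTorch checkpoint.
state = torch.load('checkpoint10.pt', map_location='cpu')
print(type(state))
if isinstance(state, dict):
    # Checkpoints are commonly dicts: model state_dict, optimizer state, epoch, ...
    print(list(state.keys())[:10])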
indic_tokenize.py.py ADDED
@@ -0,0 +1,111 @@
+#
+# Copyright (c) 2013-present, Anoop Kunchukuttan
+# All rights reserved.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+# Program for tokenizing Indian language input
+#
+# @author Anoop Kunchukuttan
+#
+"""
+Tokenizer for Indian languages. Currently, simple punctuation-based tokenizers
+are supported (see `trivial_tokenize`). Major Indian language punctuations are
+handled.
+"""
+import string, re, sys
+
+from indicnlp.common import IndicNlpException
+
+### tokenizer patterns
+triv_tokenizer_indic_pat=re.compile(r'(['+string.punctuation+r'\u0964\u0965'+r'])')
+triv_tokenizer_urdu_pat=re.compile(r'(['+string.punctuation+r'\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4'+r'])')
+
+## date, numbers, section/article numbering
+pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')
+
+def trivial_tokenize_indic(text):
+    """tokenize string for Indian language scripts using Brahmi-derived scripts
+
+    A trivial tokenizer which just tokenizes on the punctuation boundaries.
+    This also includes punctuations for the Indian language scripts (the
+    purna virama and the deergha virama). This is a language-independent
+    tokenizer.
+
+    Args:
+        text (str): text to tokenize
+
+    Returns:
+        list: list of tokens
+
+    """
+    tok_str=triv_tokenizer_indic_pat.sub(r' \1 ',text.replace('\t',' '))
+    # return re.sub(r'[ ]+',' ',tok_str).strip(' ').split(' ')
+
+    s=re.sub(r'[ ]+',' ',tok_str).strip(' ')
+
+    # do not tokenize numbers and dates
+    new_s=''
+    prev=0
+    for m in pat_num_seq.finditer(s):
+        start=m.start()
+        end=m.end()
+        if start>prev:
+            new_s=new_s+s[prev:start]
+        new_s=new_s+s[start:end].replace(' ','')
+        prev=end
+
+    new_s=new_s+s[prev:]
+    s=new_s
+
+    return s.split(' ')
+
+def trivial_tokenize_urdu(text):
+    """tokenize Urdu string
+
+    A trivial tokenizer which just tokenizes on the punctuation boundaries.
+    This also includes punctuations for the Urdu script.
+    These punctuation characters were identified from the Unicode database
+    for the Arabic script by looking for punctuation symbols.
+
+    Args:
+        text (str): text to tokenize
+
+    Returns:
+        list: list of tokens
+    """
+    tok_str=triv_tokenizer_urdu_pat.sub(r' \1 ',text.replace('\t',' '))
+    return re.sub(r'[ ]+',' ',tok_str).strip(' ').split(' ')
+
+def trivial_tokenize(text,lang='hi'):
+    """trivial tokenizer for Indian languages using Brahmi or Arabic scripts
+
+    A trivial tokenizer which just tokenizes on the punctuation boundaries.
+    Major punctuations specific to Indian languages are handled.
+    These punctuation characters were identified from the Unicode database.
+
+    Args:
+        text (str): text to tokenize
+        lang (str): ISO 639-1 language code (e.g. 'hi', 'ur')
+
+    Returns:
+        list: list of tokens
+    """
+    if lang=='ur':
+        return trivial_tokenize_urdu(text)
+    else:
+        return trivial_tokenize_indic(text)
+
+# if __name__ == '__main__':
+
+#     if len(sys.argv)<4:
+#         print("Usage: python indic_tokenize.py <infile> <outfile> <language>")
+#         sys.exit(1)
+
+#     with open(sys.argv[1],'r', encoding='utf-8') as ifile:
+#         with open(sys.argv[2],'w', encoding='utf-8') as ofile:
+#             for line in ifile:
+#                 tokenized_line=' '.join(trivial_tokenize(line,sys.argv[3]))
+#                 ofile.write(tokenized_line)
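For reference, a minimal usage sketch of the new tokenizer. It assumes the committed file is renamed from indic_tokenize.py.py (the doubled extension looks accidental) to indic_tokenize.py so it is importable, and that the indicnlp package it imports from is installed:

from indic_tokenize import trivial_tokenize

# Devanagari: the purna virama (danda, U+0964) is split off as its own token.
print(trivial_tokenize('यह एक वाक्य है।', lang='hi'))
# ['यह', 'एक', 'वाक्य', 'है', '।']

# Dates and decimals survive as single tokens thanks to the number re-merge pass.
print(trivial_tokenize('आज 12.5.2013 है।', lang='hi'))
# ['आज', '12.5.2013', 'है', '।']

# lang='ur' dispatches to the Urdu tokenizer, which handles Arabic-script
# punctuation such as the Arabic full stop (U+06D4).
print(trivial_tokenize('یہ ایک جملہ ہے۔', lang='ur'))
# ['یہ', 'ایک', 'جملہ', 'ہے', '۔']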
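The least obvious step in trivial_tokenize_indic is the pat_num_seq pass: punctuation splitting first shatters digit runs, and the finditer loop then re-joins any "digits [,.:/] digits" sequence so dates, decimals, and section numbers come out whole. A small illustration of just that step (using a re.sub with a callback that is equivalent to, but not identical with, the file's loop):

import re, string

triv_tokenizer_indic_pat = re.compile(r'(['+string.punctuation+r'\u0964\u0965'+r'])')
pat_num_seq = re.compile(r'([0-9]+ [,.:/] )+[0-9]+')

s = triv_tokenizer_indic_pat.sub(r' \1 ', 'Section 2.3.1')
s = re.sub(r'[ ]+', ' ', s).strip(' ')
print(s)  # 'Section 2 . 3 . 1'  <- the number is shattered by punctuation splitting
print(pat_num_seq.sub(lambda m: m.group().replace(' ', ''), s))
# 'Section 2.3.1'               <- the re-merge pass restores it as one token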