Update rag_tokenizer.py
Browse files- rag_tokenizer.py +20 -45
rag_tokenizer.py
CHANGED
@@ -15,7 +15,7 @@
|
|
15 |
#
|
16 |
|
17 |
import copy
|
18 |
-
import
|
19 |
import math
|
20 |
import os
|
21 |
import re
|
@@ -25,7 +25,6 @@ from hanziconv import HanziConv
|
|
25 |
from huggingface_hub import snapshot_download
|
26 |
from nltk import word_tokenize
|
27 |
from nltk.stem import PorterStemmer, WordNetLemmatizer
|
28 |
-
from api.utils.file_utils import get_project_base_directory
|
29 |
|
30 |
|
31 |
class RagTokenizer:
|
@@ -38,48 +37,37 @@ class RagTokenizer:
|
|
38 |
def loadDict_(self, fnm):
|
39 |
print("[HUQIE]:Build trie", fnm, file=sys.stderr)
|
40 |
try:
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
self.trie_.save(fnm + ".trie")
|
54 |
-
of.close()
|
55 |
except Exception as e:
|
56 |
-
print("[HUQIE]:
|
57 |
|
58 |
def __init__(self, debug=False):
|
59 |
self.DEBUG = debug
|
60 |
self.DENOMINATOR = 1000000
|
61 |
-
self.trie_ =
|
62 |
-
self.DIR_ = os.path.join(
|
63 |
|
64 |
self.stemmer = PorterStemmer()
|
65 |
self.lemmatizer = WordNetLemmatizer()
|
66 |
|
67 |
self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=γγοΌγοΌγοΌββοΌββγγ~οΌοΏ₯%β¦β¦οΌοΌββ-]+|[a-z\.-]+|[0-9,\.-]+)"
|
68 |
try:
|
69 |
-
self.
|
70 |
-
return
|
71 |
except Exception as e:
|
72 |
print("[HUQIE]:Build default trie", file=sys.stderr)
|
73 |
-
self.trie_ = datrie.Trie(string.printable)
|
74 |
-
|
75 |
-
self.loadDict_(self.DIR_ + ".txt")
|
76 |
|
77 |
def loadUserDict(self, fnm):
|
78 |
-
try:
|
79 |
-
self.trie_ = datrie.Trie.load(fnm + ".trie")
|
80 |
-
return
|
81 |
-
except Exception as e:
|
82 |
-
self.trie_ = datrie.Trie(string.printable)
|
83 |
self.loadDict_(fnm)
|
84 |
|
85 |
def addUserDict(self, fnm):
|
@@ -357,26 +345,13 @@ class RagTokenizer:
|
|
357 |
|
358 |
|
359 |
def is_chinese(s):
|
360 |
-
|
361 |
-
return True
|
362 |
-
else:
|
363 |
-
return False
|
364 |
-
|
365 |
|
366 |
def is_number(s):
|
367 |
-
|
368 |
-
return True
|
369 |
-
else:
|
370 |
-
return False
|
371 |
-
|
372 |
|
373 |
def is_alphabet(s):
|
374 |
-
|
375 |
-
s >= u'\u0061' and s <= u'\u007a'):
|
376 |
-
return True
|
377 |
-
else:
|
378 |
-
return False
|
379 |
-
|
380 |
|
381 |
def naiveQie(txt):
|
382 |
tks = []
|
|
|
15 |
#
|
16 |
|
17 |
import copy
|
18 |
+
import pygtrie
|
19 |
import math
|
20 |
import os
|
21 |
import re
|
|
|
25 |
from huggingface_hub import snapshot_download
|
26 |
from nltk import word_tokenize
|
27 |
from nltk.stem import PorterStemmer, WordNetLemmatizer
|
|
|
28 |
|
29 |
|
30 |
class RagTokenizer:
|
|
|
37 |
def loadDict_(self, fnm):
    """Read the dictionary file ``fnm`` and insert its entries into ``self.trie_``.

    Every non-blank line is split on single spaces/tabs and must provide at
    least three fields: the term, its raw frequency and a tag. The frequency
    is stored as ``int(log(freq / self.DENOMINATOR) + .5)``.

    For each term, ``self.key_(term)`` maps to ``(F, tag)``; an existing
    entry is only replaced when the new ``F`` is strictly larger.
    ``self.rkey_(term)`` is additionally stored with the value ``1``.

    Any exception (missing file, malformed line, ...) is reported on stderr
    and ends the load; nothing is raised to the caller.
    """
    print("[HUQIE]:Build trie", fnm, file=sys.stderr)
    try:
        with open(fnm, "r", encoding='utf-8') as of:
            for line in of:
                line = re.sub(r"[\r\n]+", "", line)
                if not line.strip():
                    # A blank line carries no entry. Letting it through
                    # would raise below and abort the whole load, silently
                    # dropping every entry that follows it.
                    continue
                fields = re.split(r"[ \t]", line)
                k = self.key_(fields[0])
                F = int(math.log(float(fields[1]) / self.DENOMINATOR) + .5)
                if k not in self.trie_ or self.trie_[k][0] < F:
                    self.trie_[k] = (F, fields[2])
                self.trie_[self.rkey_(fields[0])] = 1
    except Exception as e:
        print("[HUQIE]:Failed to build trie, ", fnm, e, file=sys.stderr)
|
54 |
|
55 |
def __init__(self, debug=False):
    """Set up the tokenizer state and load the default dictionary.

    Args:
        debug: stored as ``self.DEBUG``.

    The dictionary is read from ``<HF_HOME>/huqie.txt``. When ``HF_HOME``
    is not set, the documented Hugging Face default location
    (``$XDG_CACHE_HOME/huggingface`` or ``~/.cache/huggingface``) is used
    instead of passing ``None`` to ``os.path.join``, which raises
    ``TypeError``.
    """
    self.DEBUG = debug
    self.DENOMINATOR = 1000000
    self.trie_ = pygtrie.CharTrie()

    hf_home = os.getenv('HF_HOME')
    if hf_home is None:
        cache_root = os.getenv('XDG_CACHE_HOME') or os.path.join(
            os.path.expanduser("~"), ".cache")
        hf_home = os.path.join(cache_root, "huggingface")
    self.DIR_ = os.path.join(hf_home, "huqie")

    self.stemmer = PorterStemmer()
    self.lemmatizer = WordNetLemmatizer()

    # The full-width/CJK punctuation below was mis-decoded (UTF-8 bytes read
    # as a Greek single-byte charset); it is restored here to the characters
    # those byte sequences encode.
    self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》，。？、；‘’：“”【】~！￥%……（）——-]+|[a-z\.-]+|[0-9,\.-]+)"
    try:
        self.loadDict_(self.DIR_ + ".txt")
    except Exception:
        print("[HUQIE]:Build default trie", file=sys.stderr)
|
|
|
|
|
|
|
69 |
|
70 |
def loadUserDict(self, fnm):
    """Load the dictionary file ``fnm`` by delegating to ``self.loadDict_``.

    NOTE(review): nothing here resets ``self.trie_`` before loading, so the
    entries appear to be added on top of whatever is already in the trie --
    confirm that this is intended.
    """
    self.loadDict_(fnm)
|
72 |
|
73 |
def addUserDict(self, fnm):
|
|
|
345 |
|
346 |
|
347 |
def is_chinese(s):
    """Return True when ``s`` lies in the range U+4E00..U+9FA5 (inclusive).

    The check is a plain string comparison, so it is meaningful for a
    single character; longer strings are compared lexicographically and
    the empty string yields False.
    """
    return '\u4e00' <= s <= '\u9fa5'
|
|
|
|
|
|
|
|
|
349 |
|
350 |
def is_number(s):
    """Return True when ``s`` lies in the range '0'..'9' (U+0030..U+0039).

    The check is a plain string comparison, so it is meaningful for a
    single character; longer strings are compared lexicographically and
    the empty string yields False.
    """
    return '\u0030' <= s <= '\u0039'
|
|
|
|
|
|
|
|
|
352 |
|
353 |
def is_alphabet(s):
    """Return True when ``s`` lies in 'A'..'Z' or 'a'..'z' (ASCII letters).

    The check is a plain string comparison, so it is meaningful for a
    single character; longer strings are compared lexicographically and
    the empty string yields False.
    """
    return ('\u0041' <= s <= '\u005a') or ('\u0061' <= s <= '\u007a')
|
|
|
|
|
|
|
|
|
|
|
355 |
|
356 |
def naiveQie(txt):
|
357 |
tks = []
|