devve1 committed on
Commit
6edbf84
•
1 Parent(s): 68fd265

Update rag_tokenizer.py

Browse files
Files changed (1) hide show
  1. rag_tokenizer.py +20 -45
rag_tokenizer.py CHANGED
@@ -15,7 +15,7 @@
15
  #
16
 
17
  import copy
18
- import datrie
19
  import math
20
  import os
21
  import re
@@ -25,7 +25,6 @@ from hanziconv import HanziConv
25
  from huggingface_hub import snapshot_download
26
  from nltk import word_tokenize
27
  from nltk.stem import PorterStemmer, WordNetLemmatizer
28
- from api.utils.file_utils import get_project_base_directory
29
 
30
 
31
  class RagTokenizer:
@@ -38,48 +37,37 @@ class RagTokenizer:
38
  def loadDict_(self, fnm):
39
  print("[HUQIE]:Build trie", fnm, file=sys.stderr)
40
  try:
41
- of = open(fnm, "r", encoding='utf-8')
42
- while True:
43
- line = of.readline()
44
- if not line:
45
- break
46
- line = re.sub(r"[\r\n]+", "", line)
47
- line = re.split(r"[ \t]", line)
48
- k = self.key_(line[0])
49
- F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
50
- if k not in self.trie_ or self.trie_[k][0] < F:
51
- self.trie_[self.key_(line[0])] = (F, line[2])
52
- self.trie_[self.rkey_(line[0])] = 1
53
- self.trie_.save(fnm + ".trie")
54
- of.close()
55
  except Exception as e:
56
- print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)
57
 
58
  def __init__(self, debug=False):
59
  self.DEBUG = debug
60
  self.DENOMINATOR = 1000000
61
- self.trie_ = datrie.Trie(string.printable)
62
- self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
63
 
64
  self.stemmer = PorterStemmer()
65
  self.lemmatizer = WordNetLemmatizer()
66
 
67
  self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
68
  try:
69
- self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
70
- return
71
  except Exception as e:
72
  print("[HUQIE]:Build default trie", file=sys.stderr)
73
- self.trie_ = datrie.Trie(string.printable)
74
-
75
- self.loadDict_(self.DIR_ + ".txt")
76
 
77
  def loadUserDict(self, fnm):
78
- try:
79
- self.trie_ = datrie.Trie.load(fnm + ".trie")
80
- return
81
- except Exception as e:
82
- self.trie_ = datrie.Trie(string.printable)
83
  self.loadDict_(fnm)
84
 
85
  def addUserDict(self, fnm):
@@ -357,26 +345,13 @@ class RagTokenizer:
357
 
358
 
359
  def is_chinese(s):
360
- if s >= u'\u4e00' and s <= u'\u9fa5':
361
- return True
362
- else:
363
- return False
364
-
365
 
366
  def is_number(s):
367
- if s >= u'\u0030' and s <= u'\u0039':
368
- return True
369
- else:
370
- return False
371
-
372
 
373
  def is_alphabet(s):
374
- if (s >= u'\u0041' and s <= u'\u005a') or (
375
- s >= u'\u0061' and s <= u'\u007a'):
376
- return True
377
- else:
378
- return False
379
-
380
 
381
  def naiveQie(txt):
382
  tks = []
 
15
  #
16
 
17
  import copy
18
+ import pygtrie
19
  import math
20
  import os
21
  import re
 
25
  from huggingface_hub import snapshot_download
26
  from nltk import word_tokenize
27
  from nltk.stem import PorterStemmer, WordNetLemmatizer
 
28
 
29
 
30
  class RagTokenizer:
 
37
  def loadDict_(self, fnm):
38
  print("[HUQIE]:Build trie", fnm, file=sys.stderr)
39
  try:
40
+ with open(fnm, "r", encoding='utf-8') as of:
41
+ while True:
42
+ line = of.readline()
43
+ if not line:
44
+ break
45
+ line = re.sub(r"[\r\n]+", "", line)
46
+ line = re.split(r"[ \t]", line)
47
+ k = self.key_(line[0])
48
+ F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
49
+ if k not in self.trie_ or self.trie_[k][0] < F:
50
+ self.trie_[self.key_(line[0])] = (F, line[2])
51
+ self.trie_[self.rkey_(line[0])] = 1
 
 
52
  except Exception as e:
53
+ print("[HUQIE]:Failed to build trie, ", fnm, e, file=sys.stderr)
54
 
55
  def __init__(self, debug=False):
56
  self.DEBUG = debug
57
  self.DENOMINATOR = 1000000
58
+ self.trie_ = pygtrie.CharTrie()
59
+ self.DIR_ = os.path.join(os.getenv('HF_HOME'), "huqie")
60
 
61
  self.stemmer = PorterStemmer()
62
  self.lemmatizer = WordNetLemmatizer()
63
 
64
  self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
65
  try:
66
+ self.loadDict_(self.DIR_ + ".txt")
 
67
  except Exception as e:
68
  print("[HUQIE]:Build default trie", file=sys.stderr)
 
 
 
69
 
70
  def loadUserDict(self, fnm):
 
 
 
 
 
71
  self.loadDict_(fnm)
72
 
73
  def addUserDict(self, fnm):
 
345
 
346
 
347
  def is_chinese(s):
348
+ return u'\u4e00' <= s <= u'\u9fa5'
 
 
 
 
349
 
350
  def is_number(s):
351
+ return u'\u0030' <= s <= u'\u0039'
 
 
 
 
352
 
353
  def is_alphabet(s):
354
+ return (u'\u0041' <= s <= u'\u005a') or (u'\u0061' <= s <= u'\u007a')
 
 
 
 
 
355
 
356
  def naiveQie(txt):
357
  tks = []