Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Sleeping

App Files Community

devve1 committed on Jul 29

Commit

6edbf84

•

1 Parent(s): 68fd265

Update rag_tokenizer.py

Browse files

Files changed (1) hide show

rag_tokenizer.py +20 -45

rag_tokenizer.py CHANGED Viewed

@@ -15,7 +15,7 @@
 #
 import copy
-import datrie
 import math
 import os
 import re
@@ -25,7 +25,6 @@ from hanziconv import HanziConv
 from huggingface_hub import snapshot_download
 from nltk import word_tokenize
 from nltk.stem import PorterStemmer, WordNetLemmatizer
-from api.utils.file_utils import get_project_base_directory
 class RagTokenizer:
@@ -38,48 +37,37 @@ class RagTokenizer:
     def loadDict_(self, fnm):
         print("[HUQIE]:Build trie", fnm, file=sys.stderr)
         try:
-            of = open(fnm, "r", encoding='utf-8')
-            while True:
-                line = of.readline()
-                if not line:
-                    break
-                line = re.sub(r"[\r\n]+", "", line)
-                line = re.split(r"[ \t]", line)
-                k = self.key_(line[0])
-                F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
-                if k not in self.trie_ or self.trie_[k][0] < F:
-                    self.trie_[self.key_(line[0])] = (F, line[2])
-                self.trie_[self.rkey_(line[0])] = 1
-            self.trie_.save(fnm + ".trie")
-            of.close()
         except Exception as e:
-            print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)
     def __init__(self, debug=False):
         self.DEBUG = debug
         self.DENOMINATOR = 1000000
-        self.trie_ = datrie.Trie(string.printable)
-        self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
         self.stemmer = PorterStemmer()
         self.lemmatizer = WordNetLemmatizer()
         self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》，。？、；‘’：“”【】~！￥%……（）——-]+|[a-z\.-]+|[0-9,\.-]+)"
         try:
-            self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
-            return
         except Exception as e:
             print("[HUQIE]:Build default trie", file=sys.stderr)
-            self.trie_ = datrie.Trie(string.printable)
-        self.loadDict_(self.DIR_ + ".txt")
     def loadUserDict(self, fnm):
-        try:
-            self.trie_ = datrie.Trie.load(fnm + ".trie")
-            return
-        except Exception as e:
-            self.trie_ = datrie.Trie(string.printable)
         self.loadDict_(fnm)
     def addUserDict(self, fnm):
@@ -357,26 +345,13 @@ class RagTokenizer:
 def is_chinese(s):
-    if s >= u'\u4e00' and s <= u'\u9fa5':
-        return True
-    else:
-        return False
 def is_number(s):
-    if s >= u'\u0030' and s <= u'\u0039':
-        return True
-    else:
-        return False
 def is_alphabet(s):
-    if (s >= u'\u0041' and s <= u'\u005a') or (
-            s >= u'\u0061' and s <= u'\u007a'):
-        return True
-    else:
-        return False
 def naiveQie(txt):
     tks = []

 #
 import copy
+import pygtrie
 import math
 import os
 import re
 from huggingface_hub import snapshot_download
 from nltk import word_tokenize
 from nltk.stem import PorterStemmer, WordNetLemmatizer
 class RagTokenizer:
     def loadDict_(self, fnm):
         print("[HUQIE]:Build trie", fnm, file=sys.stderr)
         try:
+            with open(fnm, "r", encoding='utf-8') as of:
+                while True:
+                    line = of.readline()
+                    if not line:
+                        break
+                    line = re.sub(r"[\r\n]+", "", line)
+                    line = re.split(r"[ \t]", line)
+                    k = self.key_(line[0])
+                    F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
+                    if k not in self.trie_ or self.trie_[k][0] < F:
+                        self.trie_[self.key_(line[0])] = (F, line[2])
+                    self.trie_[self.rkey_(line[0])] = 1
         except Exception as e:
+            print("[HUQIE]:Failed to build trie, ", fnm, e, file=sys.stderr)
     def __init__(self, debug=False):
         self.DEBUG = debug
         self.DENOMINATOR = 1000000
+        self.trie_ = pygtrie.CharTrie()
+        self.DIR_ = os.path.join(os.getenv('HF_HOME'), "huqie")
         self.stemmer = PorterStemmer()
         self.lemmatizer = WordNetLemmatizer()
         self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》，。？、；‘’：“”【】~！￥%……（）——-]+|[a-z\.-]+|[0-9,\.-]+)"
         try:
+            self.loadDict_(self.DIR_ + ".txt")
         except Exception as e:
             print("[HUQIE]:Build default trie", file=sys.stderr)
     def loadUserDict(self, fnm):
         self.loadDict_(fnm)
     def addUserDict(self, fnm):
 def is_chinese(s):
+    return u'\u4e00' <= s <= u'\u9fa5'
 def is_number(s):
+    return u'\u0030' <= s <= u'\u0039'
 def is_alphabet(s):
+    return (u'\u0041' <= s <= u'\u005a') or (u'\u0061' <= s <= u'\u007a')
 def naiveQie(txt):
     tks = []