Update nlp.py
Browse files
nlp.py
CHANGED
@@ -3,8 +3,8 @@ import rag_tokenizer
|
|
3 |
def tokenize(d, t, eng):
|
4 |
d["content_with_weight"] = t
|
5 |
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
6 |
-
d["content_ltks"] = tokenize(t)
|
7 |
-
d["content_sm_ltks"] = fine_grained_tokenize(d["content_ltks"])
|
8 |
|
9 |
def is_english(texts):
|
10 |
eng = 0
|
|
|
3 |
def tokenize(d, t, eng):
    """Populate ``d`` in place with the text ``t`` and its token fields.

    Three keys are written to ``d``:

    * ``"content_with_weight"`` -- ``t`` exactly as passed in.
    * ``"content_ltks"`` -- the result of ``rag_tokenizer.tokenize`` applied
      to ``t`` after table markup has been blanked out.
    * ``"content_sm_ltks"`` -- the result of
      ``rag_tokenizer.fine_grained_tokenize`` applied to ``"content_ltks"``.

    ``eng`` is accepted but not used by this function. Nothing is returned.
    """
    # Keep the untouched text first, before any markup is stripped.
    d["content_with_weight"] = t

    # Opening/closing table, td, caption, tr and th tags (optionally carrying
    # a short attribute string) are each replaced by a single space.
    markup_free = re.sub(
        r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t
    )

    d["content_ltks"] = rag_tokenizer.tokenize(markup_free)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(
        d["content_ltks"]
    )
|
8 |
|
9 |
def is_english(texts):
|
10 |
eng = 0
|