devve1 committed on
Commit
8628daa
1 Parent(s): 0264b79

Update nlp.py

Browse files
Files changed (1) hide show
  1. nlp.py +2 -2
nlp.py CHANGED
@@ -3,8 +3,8 @@ import rag_tokenizer
3
  def tokenize(d, t, eng):
4
  d["content_with_weight"] = t
5
  t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
6
- d["content_ltks"] = tokenize(t)
7
- d["content_sm_ltks"] = fine_grained_tokenize(d["content_ltks"])
8
 
9
  def is_english(texts):
10
  eng = 0
 
3
  def tokenize(d, t, eng):
4
  d["content_with_weight"] = t
5
  t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
6
+ d["content_ltks"] = rag_tokenizer.tokenize(t)
7
+ d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
8
 
9
  def is_english(texts):
10
  eng = 0