Spaces:
Running
on
T4
Running
on
T4
File size: 507 Bytes
0264b79 8628daa 0264b79 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
import re

import rag_tokenizer
def tokenize(d, t, eng):
    """Populate chunk dict *d* with tokenized views of text *t*.

    Stores the raw text under ``content_with_weight``, then strips simple
    table-related HTML tags before producing coarse (``content_ltks``) and
    fine-grained (``content_sm_ltks``) token strings via ``rag_tokenizer``.
    The *eng* flag is accepted for interface compatibility but not used here.
    Mutates *d* in place; returns None.
    """
    # Keep the original, untouched text for weighting/display purposes.
    d["content_with_weight"] = t
    # Replace table markup (open or close tags, optionally with a short
    # attribute string) by a space so tags don't fuse adjacent words.
    cleaned = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
    coarse = rag_tokenizer.tokenize(cleaned)
    d["content_ltks"] = coarse
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(coarse)
def is_english(texts):
    """Heuristic language check over a list of strings.

    Returns True when strictly more than 80% of the entries in *texts*
    begin (after stripping surrounding whitespace) with a run of at least
    two ASCII letters; returns False for an empty or falsy *texts*.
    """
    if not texts:
        return False
    # re.match anchors at the start, so only a leading [a-zA-Z]{2,} counts.
    eng = sum(1 for t in texts if re.match(r"[a-zA-Z]{2,}", t.strip()))
    # Single post-loop ratio test is equivalent to the incremental check:
    # eng only grows and the denominator is fixed. (Also drops the stray
    # '|' scrape artifact that made the original's final line a syntax error.)
    return eng / len(texts) > 0.8