# NOTE(review): removed non-code scrape residue ("Spaces: Running on T4",
# repeated) that was captured from the hosting UI, not from the source file.
import re

import rag_tokenizer
def tokenize(d, t, eng):
    """Fill chunk dict *d* with tokenized representations of text *t*.

    Stores the raw text under ``"content_with_weight"``, strips HTML table
    markup (``table``/``td``/``caption``/``tr``/``th`` tags), then writes a
    coarse token string to ``"content_ltks"`` and its fine-grained variant
    to ``"content_sm_ltks"`` via ``rag_tokenizer``.

    NOTE(review): *eng* is accepted but never read here — presumably kept
    for signature parity with sibling tokenize helpers; confirm before
    removing.
    """
    d["content_with_weight"] = t
    # Drop table-structure tags so they don't pollute the token stream.
    cleaned = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
    coarse = rag_tokenizer.tokenize(cleaned)
    d["content_ltks"] = coarse
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(coarse)
def is_english(texts, threshold=0.8):
    """Heuristically decide whether a list of text snippets is English.

    A snippet counts as English when, after stripping surrounding
    whitespace, it *starts* with at least two consecutive ASCII letters.
    Returns True when the fraction of such snippets strictly exceeds
    *threshold* (default 0.8, matching the original hard-coded value).

    Parameters:
        texts: iterable of strings (a falsy value returns False).
        threshold: fraction of English-looking snippets required; kept as
            a keyword with the original constant so existing callers are
            unaffected.

    Returns:
        bool
    """
    if not texts:
        # Empty or None input: nothing to classify, and avoids division by zero.
        return False
    # re.match anchors at the start, so only a leading run of letters counts.
    eng = sum(1 for t in texts if re.match(r"[a-zA-Z]{2,}", t.strip()))
    return eng / len(texts) > threshold