import re

import rag_tokenizer


def tokenize(d, t, eng):
    """Fill chunk dict `d` with tokenized forms of text `t` in place."""
    # Keep the raw text (HTML intact) for weighted display.
    d["content_with_weight"] = t
    # Strip simple HTML table markup before tokenizing.
    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
    # Coarse tokens, then a fine-grained re-tokenization of those tokens.
    # The `eng` flag is accepted for interface compatibility but unused here.
    d["content_ltks"] = rag_tokenizer.tokenize(t)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])


def is_english(texts):
    """Return True if more than 80% of the strings start with an English word."""
    if not texts:
        return False
    eng = 0
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
            eng += 1
    return eng / len(texts) > 0.8
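
# Minimal usage sketch, run as a script. It assumes `rag_tokenizer` is
# importable and that its tokenize()/fine_grained_tokenize() accept and
# return strings of whitespace-joined tokens, as the functions above imply;
# the sample text and dict name are illustrative only.
if __name__ == "__main__":
    chunk = {}
    text = "<table><tr><td>Deep learning</td></tr></table> changed NLP."
    tokenize(chunk, text, is_english([text]))
    print(chunk["content_ltks"])      # coarse tokens, table tags stripped
    print(is_english(["hello", "world", "42"]))  # False: only 2/3 look English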