Spaces:
Runtime error
Runtime error
File size: 1,922 Bytes
3238f2f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import hazm
from cleantext import clean
import regex as re
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag-like span removed.

    The match is non-greedy, so each removal stops at the first ``>`` that
    follows a ``<``. No DOTALL flag is used, so a ``<...>`` span that
    contains a line break is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
# Characters stripped by ``cleaning`` after normalization: emoji, pictographs,
# directional isolates and similar symbols. Compiled once at import time
# instead of on every call.
# NOTE(review): the ranges U+24C2-U+1F251 and U+10000-U+10FFFF are very broad
# (the former also spans e.g. the CJK blocks) and subsume several of the
# narrower entries; they are kept unchanged to preserve existing output.
_WEIRD_CHARS = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u200d"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"
    "\u2069"
    "\u2066"
    # U+200C (zero-width non-joiner) is deliberately NOT removed.
    "\u2068"
    "\u2067"
    "]+",
    flags=re.UNICODE,
)


def cleaning(text):
    """Clean and normalize a piece of raw text.

    Steps, in order:

    1. Strip leading/trailing whitespace.
    2. Generic cleanup with ``cleantext.clean``: fix unicode, lower-case,
       drop line breaks, and replace URLs, e-mail addresses, phone numbers
       and currency symbols with the empty string. Numbers, digits and
       punctuation are kept.
    3. Remove ``<...>`` tag-like spans via ``cleanhtml``.
    4. Normalize with ``hazm.Normalizer``.
    5. Remove emoji / symbol characters matched by ``_WEIRD_CHARS``.
    6. Remove ``#`` characters and collapse every whitespace run to a
       single space.

    Args:
        text: The input string.

    Returns:
        The cleaned string. Removals in steps 3-6 happen after the initial
        strip, so the result may still start or end with a single space.
    """
    text = text.strip()

    # regular cleaning
    # https://pypi.org/project/clean-text/ >> works well for eng and de languages
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,  # keep the punctuation
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing > https://github.com/sobhe/hazm
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns (emoji, pictographs, directional marks)
    text = _WEIRD_CHARS.sub("", text)

    # removing hashtags markers and extra spaces
    text = text.replace("#", "")
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)

    return text
|