from parsivar import Normalizer
from parsivar import SpellCheck

import num2fawords
import re
import string

from dictionary import dictionary_mapping, fixator_dictionary

_normalizer = Normalizer(half_space_char="\u200c", statistical_space_correction=True)
_spell = SpellCheck()
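
# The Normalizer above is configured to emit "\u200c" (ZWNJ) as the half-space
# character and to apply parsivar's statistical space correction; SpellCheck
# backs the optional is_spell_check path in normalizer() below.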

# Punctuation, diacritics, and Latin letters/digits to strip from transcripts.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š', 'ّ', 'ْ',
]
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
# Escape each entry so "-" cannot form an accidental range (e.g. "!-;")
# inside the character class.
chars_to_ignore = f"""[{"".join(map(re.escape, chars_to_ignore))}]"""

zwnj = "\u200c"
# Letters that never join to the following character (plus ZWNJ and space):
# a following "آ" needs no separating ZWNJ after one of these.
silent_chars = ["ا", "د", "ذ", "ر", "ز", "و", "آ"] + [zwnj] + [" "]
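
# chars_to_ignore is now a single regex character class; for example,
# re.sub(chars_to_ignore, "", "سلام!؟") drops the punctuation and keeps "سلام".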


def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each key of `chars_to_mapping` in `text`
    with its mapped value; keys are escaped so they match literally."""
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
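
# Example: multiple_replace("ﮐتاب", {"ﮐ": "ک"}) returns "کتاب".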


def remove_special_characters(text, chars_to_ignore_regex):
    """Drop ignorable characters, lowercase, and pad with a trailing space."""
    return re.sub(chars_to_ignore_regex, "", text).lower() + " "


def convert_word_nums_to_text(word):
    """Spell out a numeric token in Persian words; tokens that cannot be
    parsed as integers are returned unchanged."""
    try:
        word = num2fawords.words(int(word))
    except (ValueError, TypeError):
        pass

    return word
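
# Example: convert_word_nums_to_text("14") should yield "چهارده", while a
# non-numeric token such as "سلام" passes through untouched.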


def normalizer_at_word_level(text):
    """Apply per-word fixes: spell out digits, then look each word up in
    fixator_dictionary (falling back to the word itself)."""
    _text = []

    for word in text.split():
        word = convert_word_nums_to_text(word)
        word = fixator_dictionary.get(word, word)
        _text.append(word)

    return " ".join(_text) + " "
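
# Example (assuming fixator_dictionary has no entry for these tokens):
# normalizer_at_word_level("من 2 کتاب دارم") -> "من دو کتاب دارم " (note the
# trailing space).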


def finder(ss, s, starter=False):
    """Return all match positions of `ss` in `s`: start offsets when
    `starter` is True, otherwise (start, end) pairs."""
    if starter:
        return [m.start() for m in re.finditer(ss, s)]

    return [(m.start(), m.end()) for m in re.finditer(ss, s)]
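
# Example: finder("ها", "کتاب ها و ها") -> [(5, 7), (10, 12)], or [5, 10]
# with starter=True.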


def substring_replace(ss, s, start, end, stripped=True):
    """Replace s[start:end] with `ss`. With `stripped`, whitespace dangling
    just before the replacement is removed; the returned counter is 1 when a
    space was dropped (0 otherwise) so callers can correct their offsets."""
    s_start = s[:start]
    s_end = s[end:]

    counter = 0
    if stripped:
        counter = 1 if s_start.endswith(" ") else counter
        s_start = s_start.rstrip()

    return s_start + ss + s_end, counter
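
# Example: substring_replace("X", "ab cd", 3, 4) -> ("abXd", 1): the "c" is
# replaced and the space before it is stripped.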


def normalizer(
        batch,
        is_normalize=True,
        is_spell_check=False,
        return_dict=True,
        filter_trivials=False,
        remove_extra_space=False
):
    """Normalize batch["sentence"] for ASR training and return either the
    cleaned text (return_dict=False) or the updated batch dict."""
    text = batch["sentence"].lower().strip()

    if is_normalize:
        text = _normalizer.normalize(text)

    # Character/word mappings, then removal of ignorable characters,
    # collapsing runs of spaces after each pass.
    text = multiple_replace(text, dictionary_mapping)
    text = re.sub(" +", " ", text)

    text = remove_special_characters(text, chars_to_ignore)
    text = re.sub(" +", " ", text)
special, pointer = "آ", int("0") |
|
for f in sorted(finder(special, text, True)): |
|
index = f + pointer - 1 |
|
if len(text) >= index: |
|
if text[index] not in silent_chars: |
|
new_text, extra_pointer = substring_replace( |
|
f"{text[index]}{zwnj}", text, index, index + 1, stripped=True) |
|
text = new_text |
|
pointer += 1 + 1 - 1 - extra_pointer |
|
|
|
|
|
pointer = int("0") |
|
special_list = [ |
|
|
|
"هایمان", "هایم", "هایت", "هایش", |
|
"هایتان", "هایشان", "هام", "هات", |
|
"هاتان", "هامون", "هامان", "هاش", |
|
"هاتون", "هاشان", "هاشون", |
|
"هایی", "های", "هاس", "ها" |
|
] |
|
for special in special_list: |
|
pointer = 0 |
|
text = text |
|
for f in sorted(finder(special, text, False)): |
|
start, end = f[0] + pointer - 1, f[1] + pointer - 1 |
|
if len(text) >= (end + 1): |
|
if len(text) == (end + 1): |
|
new_text, extra_pointer = substring_replace( |
|
f"{zwnj}{special}", |
|
text, |
|
start + 1, |
|
end + 1, |
|
stripped=True) |
|
text = new_text |
|
pointer += 1 + 1 - 1 - extra_pointer |
|
else: |
|
if text[end + 1] == " ": |
|
new_text, extra_pointer = substring_replace( |
|
f"{zwnj}{special}", |
|
text, |
|
start + 1, |
|
end + 1, |
|
stripped=True) |
|
text = new_text |
|
pointer += 1 + 1 - 1 - extra_pointer |
|
|
|
special, pointer = "افزار", int("0") |
|
for f in sorted(finder(special, text, False)): |
|
start, end = f[0] + pointer - 1, f[1] + pointer - 1 |
|
|
|
if len(text) >= (end + 1): |
|
new_text, extra_pointer = substring_replace(f"{zwnj}{special}", text, start + 1, end + 1, stripped=True) |
|
text = new_text |
|
pointer += 1 + 1 - 1 - extra_pointer |
|
|
|
|
|
pointer = int("0") |
|
special_list = [ |
|
"ترین", "تر" |
|
] |
|
for special in special_list: |
|
pointer = 0 |
|
text = text |
|
for f in sorted(finder(special, text, False)): |
|
start, end = f[0] + pointer - 1, f[1] + pointer - 1 |
|
if len(text) >= (end + 1): |
|
if len(text) == (end + 1): |
|
new_text, extra_pointer = substring_replace( |
|
f"{zwnj}{special}", |
|
text, |
|
start + 1, |
|
end + 1, |
|
stripped=True) |
|
text = new_text |
|
pointer += 1 + 1 - 1 - extra_pointer |
|
else: |
|
if text[end + 1] == " ": |
|
new_text, extra_pointer = substring_replace( |
|
f"{zwnj}{special}", |
|
text, |
|
start + 1, |
|
end + 1, |
|
stripped=True) |
|
text = new_text |
|
pointer += 1 + 1 - 1 - extra_pointer |
|
|
|
|
|

    # Optional statistical spell correction, followed by re-normalization.
    if is_spell_check:
        text = _normalizer.normalize(_spell.spell_corrector(text))

    text = normalizer_at_word_level(text)
    text = re.sub(" +", " ", text)

    text = text.strip()
    if not remove_extra_space:
        text = text + " "

    # Optionally drop sentences too short to be useful.
    if filter_trivials and len(text) <= 2:
        text = None

    if not return_dict:
        return text

    batch["sentence"] = text
    return batch
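

# A typical use is mapping this normalizer over a HuggingFace dataset that has
# a "sentence" column (a sketch; the dataset name and split are illustrative):
#
#     from datasets import load_dataset
#     common_voice = load_dataset("common_voice", "fa", split="train")
#     common_voice = common_voice.map(normalizer)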


if __name__ == '__main__':
    # ZWNJ insertion before word-internal "آ".
    input_text = "سلام بر شما که میآیید و میآموزید که بیآرآیم"
    print(normalizer({"sentence": input_text}, return_dict=False))

    # Plural/possessive suffix attachment ("ها", "هایمان", "هاشون", ...).
    input_text = "کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند."
    print(normalizer({"sentence": input_text}, return_dict=False))

    # "افزار" compound attachment.
    input_text = " میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها"
    print(normalizer({"sentence": input_text}, return_dict=False))

    # "تر"/"ترین" suffix attachment.
    input_text = "این کتاب بهترین در نوع شتر آسانتر هست"
    print(normalizer({"sentence": input_text}, return_dict=False))

    input_text = "سه چیز هست که از پژوهش در این زمینه آموختهام"
    print(normalizer({"sentence": input_text}, return_dict=False))