|
import os
|
|
import re
|
|
|
|
|
|
# ASCII punctuation marks. "." and "?" are regex metacharacters, so those two
# are stored in their re.escape()d form; the others are stored verbatim.
period = re.escape(".")
question_mark = re.escape("?")
colon = ":"
comma = ","
exclamation_mark = "!"
semicolon = ";"

left_curly_bracket = "{"
right_curly_bracket = "}"
quotation_mark = '"'

# Basic punctuation concatenated into one string. `semicolon` and
# `quotation_mark` are deliberately not part of this string.
basic_punc = "".join(
    (
        period,
        question_mark,
        comma,
        colon,
        exclamation_mark,
        left_curly_bracket,
        right_curly_bracket,
    )
)
|
|
|
|
|
|
# Invisible formatting characters. Each value is a raw string holding the
# six-character text of a \uXXXX escape, not the character itself.
# presumably these are spliced into a regex character class by the consumer,
# where the regex engine resolves the escape — confirm against the caller.
zero_width_space = r"\u200B"  # U+200B
zero_width_nonjoiner = r"\u200C"  # U+200C

# Directional marks and embedding controls.
left_to_right_mark = r"\u200E"  # U+200E
right_to_left_mark = r"\u200F"  # U+200F
left_to_right_embedding = r"\u202A"  # U+202A
pop_directional_formatting = r"\u202C"  # U+202C
|
|
|
|
|
|
# Curly single quotation marks, stored as raw \uXXXX escape text.
left_single_quotation_mark = r"\u2018"
right_single_quotation_mark = r"\u2019"

# Inverted marks, stored as raw \uXXXX escape text.
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"

# Unlike its neighbours this is NOT a raw string: the value is the single
# character U+0964 itself rather than six characters of escape text.
hindi_danda = "\u0964"
|
|
|
|
|
|
|
|
# Arabic punctuation, stored as raw \uXXXX escape text.
arabic_comma = r"\u060C"
arabic_semicolon = r"\u061B"
arabic_question_mark = r"\u061F"

# Ranges written in "first-last" form (U+064B through U+0652, and
# U+0656 through U+0657).
arabic_diacritics = r"\u064B-\u0652"
arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
|
|
|
|
|
|
|
|
# CJK / fullwidth punctuation, stored as raw \uXXXX escape text. Entries
# containing "-" are ranges written in "first-last" form.
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
# Vertical presentation forms of the corner brackets above (U+FE41-U+FE44).
# The previous value, \uFF41-\uFF44, is the fullwidth Latin letters a-d and
# caused those letters to be treated as punctuation.
quotation_mark_vertical = r"\uFE41-\uFE44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"

# All of the above concatenated, in declaration order.
chinese_punc = "".join(
    (
        full_stop,
        full_comma,
        full_exclamation_mark,
        full_question_mark,
        full_semicolon,
        full_colon,
        full_parentheses,
        quotation_mark_horizontal,
        quotation_mark_vertical,
        title_marks,
        wavy_low_line,
        ellipsis,
        enumeration_comma,
        hyphenation_point,
        forward_slash,
        wavy_dash,
        box_drawings_light_horizontal,
        fullwidth_low_line,
    )
)
|
|
|
|
|
|
# Armenian punctuation (U+055A-U+055F and U+0589), stored as raw \uXXXX
# escape text.
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# NOTE: this rebinds the module-level name `exclamation_mark`, which was "!"
# earlier in the file. `basic_punc` was already built from the earlier value,
# so only later readers of the name see this one.
exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"

# All Armenian marks concatenated, in declaration order.
armenian_punc = "".join(
    (
        armenian_apostrophe,
        emphasis_mark,
        exclamation_mark,
        armenian_comma,
        armenian_question_mark,
        abbreviation_mark,
        armenian_full_stop,
    )
)
|
|
|
|
# Angle brackets as literal characters; these are used as keys of the
# shared replacement mapping further down the file.
lesser_than_symbol = r"<"
greater_than_symbol = r">"

# The same two characters expressed as raw \uXXXX escape text; these are
# appended to the shared punctuation string further down the file.
lesser_than_sign = r"\u003c"
greater_than_sign = r"\u003e"
|
|
|
|
# NOTE(review): the literal is a single whitespace-looking character; the
# name suggests it stands for a non-breaking space — confirm the exact code
# point before editing, since this value is used as a key that is mapped to
# the empty string.
nbsp_written_form = r" "
|
|
|
|
|
|
# Typographic quotation marks, stored as raw \uXXXX escape text.
left_double_quotes = r"\u201c"
right_double_quotes = r"\u201d"
left_double_angle = r"\u00ab"
right_double_angle = r"\u00bb"
left_single_angle = r"\u2039"
right_single_angle = r"\u203a"
low_double_quotes = r"\u201e"
low_single_quotes = r"\u201a"
high_double_quotes = r"\u201f"
high_single_quotes = r"\u201b"

# Every quote variant concatenated, including the two curly single quotes
# declared earlier in the file.
all_punct_quotes = "".join(
    (
        left_double_quotes,
        right_double_quotes,
        left_double_angle,
        right_double_angle,
        left_single_angle,
        right_single_angle,
        low_double_quotes,
        low_single_quotes,
        high_double_quotes,
        high_single_quotes,
        right_single_quotation_mark,
        left_single_quotation_mark,
    )
)

# A bracketed character class covering the three single-quote variants.
# presumably used to map them to a plain apostrophe — confirm against caller.
mapping_quotes = "".join(
    (
        "[",
        high_single_quotes,
        right_single_quotation_mark,
        left_single_quotation_mark,
        "]",
    )
)
|
|
|
|
|
|
|
|
|
|
# Digit ranges per script, each written as raw "first-last" \uXXXX escape
# text (ten consecutive code points per range).
english_digits = r"\u0030-\u0039"
bengali_digits = r"\u09e6-\u09ef"
khmer_digits = r"\u17e0-\u17e9"
devanagari_digits = r"\u0966-\u096f"
oriya_digits = r"\u0b66-\u0b6f"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
roman_numeral = r"\u2170-\u2179"

# A single code point (U+206F), not a range.
nominal_digit_shapes = r"\u206f"
|
|
|
|
|
|
# Read the extra punctuation list that sits next to this module.
# os.path.join is used instead of an f-string so that an empty dirname
# (a relative __file__ with no directory part) resolves to the current
# directory rather than to "/punctuations.lst" at the filesystem root.
with open(
    os.path.join(os.path.dirname(__file__), "punctuations.lst"),
    "r",
    encoding="utf-8",
) as punc_f:
    punc_list = punc_f.readlines()

# The first tab-separated field of every line is the punctuation text; it is
# regex-escaped and all fields are concatenated into one string.
# The line terminator is stripped first so that a line without a tab does not
# leak an escaped "\n" into the pattern; blank lines contribute nothing.
punct_pattern = r"".join(
    re.escape(symbol)
    for symbol in (line.rstrip("\n").split("\t")[0] for line in punc_list)
    if symbol
)
|
|
|
|
# Every per-script digit range concatenated into one string, in the same
# order the ranges are listed here.
shared_digits = "".join(
    (
        english_digits,
        bengali_digits,
        khmer_digits,
        devanagari_digits,
        oriya_digits,
        extended_arabic_indic_digits,
        kayah_li_digits,
        fullwidth_digits,
        malayam_digits,
        myanmar_digits,
        roman_numeral,
        nominal_digit_shapes,
    )
)
|
|
|
|
# Punctuation shared by every language: the ASCII set, all quote variants,
# script-specific marks and finally the entries loaded from punctuations.lst.
# Despite the name this is a single concatenated string, not a list.
# Note that `full_stop` and `enumeration_comma` appear both on their own and
# again inside `chinese_punc`; the duplication is preserved as-is.
shared_punc_list = "".join(
    (
        basic_punc,
        all_punct_quotes,
        greater_than_sign,
        lesser_than_sign,
        inverted_question_mark,
        full_stop,
        semicolon,
        armenian_punc,
        inverted_exclamation_mark,
        arabic_comma,
        enumeration_comma,
        hindi_danda,
        quotation_mark,
        arabic_semicolon,
        arabic_question_mark,
        chinese_punc,
        punct_pattern,
    )
)
|
|
|
|
# Replacement table shared by every language: each key is mapped to the
# empty string. (The triple "p" in the name is kept because other code
# refers to it by this spelling.)
shared_mappping = dict.fromkeys(
    (lesser_than_symbol, greater_than_symbol, nbsp_written_form),
    "",
)
|
|
|
|
# Characters and ranges placed in the shared "del_set": directional and
# zero-width formatting characters plus the Arabic diacritic ranges.
# Despite the name this is a single concatenated string, not a list.
shared_deletion_list = "".join(
    (
        left_to_right_mark,
        zero_width_nonjoiner,
        arabic_subscript_alef_and_inverted_damma,
        zero_width_space,
        arabic_diacritics,
        pop_directional_formatting,
        right_to_left_mark,
        left_to_right_embedding,
    )
)
|
|
|
|
# Normalisation settings keyed by language code. The "*" entry holds the
# defaults; the language-specific entries below are built as copies of it.
norm_config = {
    "*": dict(
        lower_case=True,
        punc_set=shared_punc_list,
        del_set=shared_deletion_list,
        mapping=shared_mappping,
        digit_set=shared_digits,
        unicode_norm="NFKC",
        rm_diacritics=False,
    )
}
|
|
|
|
|
|
|
|
# "mon": the defaults, with U+00AD appended to the deletion set.
norm_config["mon"] = {
    **norm_config["*"],
    "del_set": norm_config["*"]["del_set"] + r"\u00AD",
}

# "khk" uses the same settings as "mon" (independent top-level dict).
norm_config["khk"] = dict(norm_config["mon"])
|
|
|
|
|
|
|
|
# "heb": the defaults, with the ranges U+05B0-U+05BF and U+05C0-U+05CF
# appended to the deletion set.
norm_config["heb"] = {
    **norm_config["*"],
    "del_set": norm_config["*"]["del_set"] + r"\u05B0-\u05BF\u05C0-\u05CF",
}
|
|
|
|
|
|
|
|
# "tha": the defaults, with U+200D appended to the punctuation set (note:
# the punctuation set, not the deletion set).
norm_config["tha"] = {
    **norm_config["*"],
    "punc_set": norm_config["*"]["punc_set"] + r"\u200D",
}
|
|
|
|
|
|
# "ara": the defaults plus one extra character replacement.
# dict.copy() is shallow, so the copied entry still points at the one
# `mapping` dict shared by every language. Assigning into it in place would
# add the Arabic rule to "*" and to every other language as well, so a fresh
# mapping dict is built for Arabic instead.
norm_config["ara"] = norm_config["*"].copy()
norm_config["ara"]["mapping"] = {
    **norm_config["*"]["mapping"],
    "ٱ": "ا",
}

# "arb" uses the same settings as "ara", with its own mapping dict so later
# edits to one entry cannot leak into the other.
norm_config["arb"] = norm_config["ara"].copy()
norm_config["arb"]["mapping"] = norm_config["ara"]["mapping"].copy()
|
|
|
|
|
|
# "jav": the defaults, with diacritic removal switched on.
norm_config["jav"] = {**norm_config["*"], "rm_diacritics": True}
|
|
|