|
import os |
|
import kenlm |
|
import sentencepiece as spm |
|
from tokenizers import normalizers, Regex |
|
|
|
|
|
|
|
# Character-level normalisation table: each key is a single code point that is
# rewritten to the value (a canonical Farsi letter, a Persian digit or
# punctuation mark, a space, or the empty string to drop it).
#
# Insertion order is significant: the entries are turned into one replacement
# step each, applied in this order.
#
# NOTE: code points above U+FFFF (the Arabic Mathematical Alphabetic Symbols,
# U+1EE00..U+1EEBB) must use the eight-digit ``\UXXXXXXXX`` escape. A
# ``\uXXXX`` escape consumes exactly four hex digits, so ``"\u1EE00"`` would be
# the two-character string U+1EE0 followed by "0" and would never match the
# intended symbol.
char_map = {
    # -> U+0623 ARABIC LETTER ALEF WITH HAMZA ABOVE
    "\uFE83": "\u0623",
    "\uFE84": "\u0623",

    # -> U+0626 ARABIC LETTER YEH WITH HAMZA ABOVE
    "\uFE89": "\u0626",
    "\uFE8A": "\u0626",
    "\uFE8B": "\u0626",
    "\uFE8C": "\u0626",

    # -> U+0624 ARABIC LETTER WAW WITH HAMZA ABOVE
    "\uFE85": "\u0624",
    "\uFE86": "\u0624",
    "\u0676": "\u0624",

    # -> U+0622 ARABIC LETTER ALEF WITH MADDA ABOVE
    "\uFE81": "\u0622",
    "\uFE82": "\u0622",

    # -> U+0627 ALEF
    "\uFB50": "\u0627",
    "\uFE87": "\u0627",
    "\u0675": "\u0627",
    "\u0625": "\u0627",
    "\uFE8D": "\u0627",
    "\uFE8E": "\u0627",
    "\U0001EE00": "\u0627",
    "\U0001EE80": "\u0627",

    # -> U+0628 BEH
    "\uFE8F": "\u0628",
    "\uFE90": "\u0628",
    "\uFE91": "\u0628",
    "\uFE92": "\u0628",
    "\U0001EE01": "\u0628",
    "\U0001EE21": "\u0628",
    "\U0001EE61": "\u0628",
    "\U0001EE81": "\u0628",
    "\U0001EEA1": "\u0628",

    # -> U+067E PEH
    "\uFB56": "\u067E",
    "\uFB57": "\u067E",
    "\uFB58": "\u067E",
    "\uFB59": "\u067E",

    # -> U+062A TEH
    "\uFE95": "\u062A",
    "\uFE96": "\u062A",
    "\uFE97": "\u062A",
    "\uFE98": "\u062A",
    "\U0001EE15": "\u062A",
    "\U0001EE35": "\u062A",
    "\U0001EE75": "\u062A",
    "\U0001EE95": "\u062A",
    "\U0001EEB5": "\u062A",

    # -> U+062B THEH
    "\uFE99": "\u062B",
    "\uFE9A": "\u062B",
    "\uFE9B": "\u062B",
    "\uFE9C": "\u062B",
    "\U0001EE16": "\u062B",
    "\U0001EE36": "\u062B",
    "\U0001EE76": "\u062B",
    "\U0001EE96": "\u062B",
    "\U0001EEB6": "\u062B",

    # -> U+062C JEEM
    "\uFE9D": "\u062C",
    "\uFE9E": "\u062C",
    "\uFE9F": "\u062C",
    "\uFEA0": "\u062C",
    "\U0001EE02": "\u062C",
    "\U0001EE22": "\u062C",
    "\U0001EE42": "\u062C",
    "\U0001EE62": "\u062C",
    "\U0001EE82": "\u062C",
    "\U0001EEA2": "\u062C",

    # -> U+0686 TCHEH
    "\uFB7A": "\u0686",
    "\uFB7B": "\u0686",
    "\uFB7C": "\u0686",
    "\uFB7D": "\u0686",

    # -> U+062D HAH
    "\uFEA1": "\u062D",
    "\uFEA2": "\u062D",
    "\uFEA3": "\u062D",
    "\uFEA4": "\u062D",
    "\U0001EE07": "\u062D",
    "\U0001EE27": "\u062D",
    "\U0001EE47": "\u062D",
    "\U0001EE67": "\u062D",
    "\U0001EE87": "\u062D",
    "\U0001EEA7": "\u062D",

    # -> U+062E KHAH
    "\uFEA5": "\u062E",
    "\uFEA6": "\u062E",
    "\uFEA7": "\u062E",
    "\uFEA8": "\u062E",
    "\U0001EE17": "\u062E",
    "\U0001EE37": "\u062E",
    "\U0001EE57": "\u062E",
    "\U0001EE77": "\u062E",
    "\U0001EE97": "\u062E",
    "\U0001EEB7": "\u062E",

    # -> U+062F DAL
    "\uFEA9": "\u062F",
    "\uFEAA": "\u062F",
    "\U0001EE03": "\u062F",
    "\U0001EE83": "\u062F",
    "\U0001EEA3": "\u062F",

    # -> U+0630 THAL
    "\uFEAB": "\u0630",
    "\uFEAC": "\u0630",
    "\U0001EE18": "\u0630",
    "\U0001EE98": "\u0630",
    "\U0001EEB8": "\u0630",

    # -> U+0631 REH
    "\uFEAE": "\u0631",
    "\uFEAD": "\u0631",
    "\u0692": "\u0631",
    "\U0001EE13": "\u0631",
    "\U0001EE93": "\u0631",
    "\U0001EEB3": "\u0631",

    # -> U+0632 ZAIN
    "\uFEAF": "\u0632",
    "\uFEB0": "\u0632",
    "\U0001EE06": "\u0632",
    "\U0001EE86": "\u0632",
    "\U0001EEA6": "\u0632",

    # -> U+0698 JEH
    "\uFB8A": "\u0698",
    "\uFB8B": "\u0698",

    # -> U+0633 SEEN
    "\uFEB1": "\u0633",
    "\uFEB2": "\u0633",
    "\uFEB3": "\u0633",
    "\uFEB4": "\u0633",
    "\U0001EE0E": "\u0633",
    "\U0001EE2E": "\u0633",
    "\U0001EE4E": "\u0633",
    "\U0001EE6E": "\u0633",
    "\U0001EE8E": "\u0633",
    "\U0001EEAE": "\u0633",

    # -> U+0634 SHEEN
    "\uFEB5": "\u0634",
    "\uFEB6": "\u0634",
    "\uFEB7": "\u0634",
    "\uFEB8": "\u0634",
    "\U0001EE14": "\u0634",
    "\U0001EE34": "\u0634",
    "\U0001EE54": "\u0634",
    "\U0001EE74": "\u0634",
    "\U0001EE94": "\u0634",
    "\U0001EEB4": "\u0634",

    # -> U+0635 SAD
    "\uFEB9": "\u0635",
    "\uFEBA": "\u0635",
    "\uFEBB": "\u0635",
    "\uFEBC": "\u0635",
    "\U0001EE11": "\u0635",
    "\U0001EE31": "\u0635",
    "\U0001EE51": "\u0635",
    "\U0001EE71": "\u0635",
    "\U0001EE91": "\u0635",
    "\U0001EEB1": "\u0635",

    # -> U+0636 DAD
    "\uFEBD": "\u0636",
    "\uFEBE": "\u0636",
    "\uFEBF": "\u0636",
    "\uFEC0": "\u0636",
    "\U0001EE19": "\u0636",
    "\U0001EE39": "\u0636",
    "\U0001EE59": "\u0636",
    "\U0001EE79": "\u0636",
    "\U0001EE99": "\u0636",
    "\U0001EEB9": "\u0636",

    # -> U+0637 TAH
    "\uFEC1": "\u0637",
    "\uFEC2": "\u0637",
    "\uFEC3": "\u0637",
    "\uFEC4": "\u0637",
    "\U0001EE08": "\u0637",
    "\U0001EE68": "\u0637",
    "\U0001EE88": "\u0637",
    "\U0001EEA8": "\u0637",

    # -> U+0638 ZAH
    "\uFEC5": "\u0638",
    "\uFEC6": "\u0638",
    "\uFEC7": "\u0638",
    "\uFEC8": "\u0638",
    "\U0001EE1A": "\u0638",
    "\U0001EE7A": "\u0638",
    "\U0001EE9A": "\u0638",
    "\U0001EEBA": "\u0638",

    # -> U+0639 AIN
    "\uFEC9": "\u0639",
    "\uFECA": "\u0639",
    "\uFECB": "\u0639",
    "\uFECC": "\u0639",
    "\U0001EE0F": "\u0639",
    "\U0001EE2F": "\u0639",
    "\U0001EE4F": "\u0639",
    "\U0001EE6F": "\u0639",
    "\U0001EE8F": "\u0639",
    "\U0001EEAF": "\u0639",

    # -> U+063A GHAIN
    "\uFECD": "\u063A",
    "\uFECE": "\u063A",
    "\uFECF": "\u063A",
    "\uFED0": "\u063A",
    "\U0001EE1B": "\u063A",
    "\U0001EE3B": "\u063A",
    "\U0001EE5B": "\u063A",
    "\U0001EE7B": "\u063A",
    "\U0001EE9B": "\u063A",
    "\U0001EEBB": "\u063A",

    # -> U+0641 FEH
    "\uFED1": "\u0641",
    "\uFED2": "\u0641",
    "\uFED3": "\u0641",
    "\uFED4": "\u0641",
    "\U0001EE10": "\u0641",
    "\U0001EE30": "\u0641",
    "\U0001EE70": "\u0641",
    "\U0001EE90": "\u0641",
    "\U0001EEB0": "\u0641",

    # -> U+0642 QAF
    "\uFED5": "\u0642",
    "\uFED6": "\u0642",
    "\uFED7": "\u0642",
    "\uFED8": "\u0642",
    "\U0001EE12": "\u0642",
    "\U0001EE32": "\u0642",
    "\U0001EE52": "\u0642",
    "\U0001EE72": "\u0642",
    "\U0001EE92": "\u0642",
    "\U0001EEB2": "\u0642",

    # -> U+06A9 KEHEH (includes the Arabic KAF, U+0643)
    "\uFB8E": "\u06A9",
    "\uFB8F": "\u06A9",
    "\uFB90": "\u06A9",
    "\uFB91": "\u06A9",
    "\uFCC8": "\u06A9",
    "\u0643": "\u06A9",
    "\uFED9": "\u06A9",
    "\uFEDA": "\u06A9",
    "\uFEDB": "\u06A9",
    "\uFEDC": "\u06A9",
    "\U0001EE0A": "\u06A9",
    "\U0001EE2A": "\u06A9",
    "\U0001EE6A": "\u06A9",

    # -> U+06AF GAF
    "\uFB92": "\u06AF",
    "\uFB93": "\u06AF",
    "\uFB94": "\u06AF",
    "\uFB95": "\u06AF",

    # -> U+0644 LAM
    "\uFCC9": "\u0644",
    "\uFEDD": "\u0644",
    "\uFEDE": "\u0644",
    "\uFEDF": "\u0644",
    "\uFEE0": "\u0644",
    "\U0001EE0B": "\u0644",
    "\U0001EE2B": "\u0644",
    "\U0001EE4B": "\u0644",
    "\U0001EE8B": "\u0644",
    "\U0001EEAB": "\u0644",

    # -> U+0645 MEEM
    "\uFEE1": "\u0645",
    "\uFEE2": "\u0645",
    "\uFEE3": "\u0645",
    "\uFEE4": "\u0645",
    "\U0001EE0C": "\u0645",
    "\U0001EE2C": "\u0645",
    "\U0001EE6C": "\u0645",
    "\U0001EE8C": "\u0645",
    "\U0001EEAC": "\u0645",

    # -> U+0646 NOON
    "\uFEE5": "\u0646",
    "\uFEE6": "\u0646",
    "\uFEE7": "\u0646",
    "\uFEE8": "\u0646",
    "\U0001EE0D": "\u0646",
    "\U0001EE2D": "\u0646",
    "\U0001EE4D": "\u0646",
    "\U0001EE6D": "\u0646",
    "\U0001EE8D": "\u0646",
    "\U0001EEAD": "\u0646",

    # -> U+0648 WAW
    "\u0677": "\u0648",
    "\uFEED": "\u0648",
    "\uFEEE": "\u0648",
    "\u06C6": "\u0648",
    "\u06C7": "\u0648",

    # -> U+0647 HEH (includes TEH MARBUTA, U+0629)
    "\u06C0": "\u0647",
    "\u0629": "\u0647",
    "\u06BE": "\u0647",
    "\uFE93": "\u0647",
    "\u06D5": "\u0647",
    "\uFEE9": "\u0647",
    "\uFEEA": "\u0647",
    "\uFEEB": "\u0647",
    "\uFEEC": "\u0647",
    "\U0001EE24": "\u0647",
    "\U0001EE64": "\u0647",
    "\U0001EE84": "\u0647",

    # -> U+06CC FARSI YEH (includes the Arabic YEH, U+064A)
    "\u06D0": "\u06CC",
    "\uFEEF": "\u06CC",
    "\uFEF3": "\u06CC",
    "\uFEF4": "\u06CC",
    "\u064A": "\u06CC",
    "\uFEF1": "\u06CC",
    "\u06CE": "\u06CC",
    "\uFBFD": "\u06CC",
    "\uFBFC": "\u06CC",
    "\uFBFE": "\u06CC",
    "\uFBFF": "\u06CC",
    "\uFEF0": "\u06CC",
    "\uFEF2": "\u06CC",
    "\u063D": "\u06CC",
    "\u063E": "\u06CC",
    "\u063F": "\u06CC",
    "\u06D2": "\u06CC",

    # Combining diacritics (harakat, shadda, hamza above) are dropped.
    "\u064E": "",
    "\u064B": "",
    "\u064F": "",
    "\u064C": "",
    "\u0650": "",
    "\u064D": "",
    "\u0652": "",
    "\u0651": "",
    "\u0654": "",

    # ASCII and Arabic-Indic digits -> Extended Arabic-Indic (Persian) digits.
    "0": "۰",
    "1": "۱",
    "2": "۲",
    "3": "۳",
    "4": "۴",
    "5": "۵",
    "6": "۶",
    "7": "۷",
    "8": "۸",
    "9": "۹",
    "٠": "۰",
    "١": "۱",
    "٢": "۲",
    "٣": "۳",
    "٤": "۴",
    "٥": "۵",
    "٦": "۶",
    "٧": "۷",
    "٨": "۸",
    "٩": "۹",

    # Punctuation -> Persian punctuation; "…" and "%" are spelled out as the
    # Persian words for "et cetera" and "percent"; directional/formatting
    # controls and bullets become a space.
    "٬": "،",
    ",": "،",
    ";": "؛",
    "?": "؟",
    "\\": " ",
    "…": " غیره ",
    "%": " درصد ",
    "\u200e": " ",
    "\u200f": " ",
    "\u202a": " ",
    "\u202b": " ",
    "\u2066": " ",
    "\u2067": " ",
    "\u2069": " ",
    "\ufdef": " ",
    "\u00B7": ".",
    "\u2022": " ",

    # Quotes, soft hyphen, underscore, plus sign, zero-width space -> space.
    "'": " ",
    "“": " ",
    "”": " ",
    "\u00ad": " ",
    "\u005f": " ",
    "\u002b": " ",
    "\u200b": " ",

    # Copyright sign -> space.
    "\u00a9": " ",

    # Em dash, right single quote and vertical bar -> space; the variation
    # selector U+FE0F is dropped.
    "\u2014": " ",
    "\u2019": " ",
    "\uFE0F": "",
    "\u007C": " ",
}
|
|
|
class KenlmModel:
    """KenLM n-gram model over SentencePiece pieces, with text normalisation.

    A document is first passed through a ``tokenizers`` normaliser pipeline
    (built from the constructor flags), then split into SentencePiece pieces,
    and the space-joined pieces are scored by the KenLM model.
    """

    def __init__(
        self,
        vocabulary_size: str,
        ngram: str,
        pruning: str,
        map_to_farsi_alphabet: bool = True,
        normalize_nfd: bool = True,
        normalize_numbers: bool = True,
        remove_puctuation: bool = True,
        remove_non_farsi: bool = True,
        *,
        model_dir: str = "files",
    ):
        """Load the model files and assemble the normaliser pipeline.

        Args:
            vocabulary_size: Vocabulary-size tag used in both file names.
            ngram: N-gram order tag used in the KenLM file name.
            pruning: Pruning tag used in the KenLM file name.
            map_to_farsi_alphabet: Apply every ``char_map`` replacement, in
                the dictionary's order.
            normalize_nfd: Apply Unicode NFD normalisation.
            normalize_numbers: Replace each Persian digit ۱-۹ with ۰.
            remove_puctuation: Delete ``.``, ``!``, ``؛``, ``،`` and ``؟``.
                (The misspelled name is kept for backward compatibility.)
            remove_non_farsi: Delete every character outside the allowed
                Farsi letters, Persian digits, whitespace, ZWNJ, ``.``/``!``
                and the Persian punctuation marks.
            model_dir: Directory containing
                ``jomleh-sp-{vocabulary_size}-o{ngram}-prune{pruning}.probing``
                and ``jomleh-sp-{vocabulary_size}.model``. Defaults to
                ``"files"`` (relative to the working directory).
        """
        kenlm_path = os.path.join(
            model_dir, f"jomleh-sp-{vocabulary_size}-o{ngram}-prune{pruning}.probing"
        )
        sp_path = os.path.join(model_dir, f"jomleh-sp-{vocabulary_size}.model")
        self.model = kenlm.Model(kenlm_path)
        self.tokenizer = spm.SentencePieceProcessor(sp_path)

        # The steps are appended in a fixed order; later steps operate on the
        # output of earlier ones.
        steps = []
        if map_to_farsi_alphabet:
            steps.extend(normalizers.Replace(src, dst) for src, dst in char_map.items())
        if normalize_nfd:
            steps.append(normalizers.NFD())
        if normalize_numbers:
            # Collapse all non-zero Persian digits onto "۰".
            steps.append(normalizers.Replace(Regex("[۱۲۳۴۵۶۷۸۹]"), "۰"))
        if remove_puctuation:
            steps.append(normalizers.Replace(Regex("[\\.!؛،؟]"), ""))
        if remove_non_farsi:
            steps.append(
                normalizers.Replace(
                    Regex(
                        "[^\u060c\u061b\u061f\u0622\u0623\u0624\u0626\u0627"
                        "\u0628\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631"
                        "\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a"
                        "\u0641\u0642\u0644\u0645\u0646\u0647\u0648\u067e\u0686"
                        "\u0698\u06a9\u06af\u06cc\u06f0\u06f1\u06f2\u06f3\u06f4"
                        "\u06f5\u06f6\u06f7\u06f8\u06f9\\s\u200c\\.\\!]"
                    ),
                    "",
                )
            )
        # NOTE(review): Strip is treated as unconditional (not tied to
        # ``remove_non_farsi``); with the default flags both readings are
        # equivalent -- confirm against the upstream file.
        steps.append(normalizers.Strip())

        self.normalizer = normalizers.Sequence(steps)

    @classmethod
    def from_pretrained(
        cls,
        vocabulary_size: str,
        ngram: str,
        pruning: str,
        map_to_farsi_alphabet: bool = True,
        normalize_nfd: bool = True,
        normalize_numbers: bool = True,
        remove_puctuation: bool = True,
        remove_non_farsi: bool = True,
        *,
        model_dir: str = "files",
    ) -> "KenlmModel":
        """Alternate constructor; forwards every argument to ``__init__``."""
        return cls(
            vocabulary_size,
            ngram,
            pruning,
            map_to_farsi_alphabet,
            normalize_nfd,
            normalize_numbers,
            remove_puctuation,
            remove_non_farsi,
            model_dir=model_dir,
        )

    def _encode(self, doc: str) -> str:
        """Normalise *doc*, tokenise it, and return the space-joined pieces."""
        normalized = self.normalizer.normalize_str(doc)
        return " ".join(self.tokenizer.encode(normalized, out_type=str))

    def score(self, doc: str) -> float:
        """Return the KenLM score of *doc* after normalisation and tokenisation."""
        return self.model.score(self._encode(doc))

    def perplexity(self, doc: str) -> float:
        """Return the perplexity of *doc*, rounded to one decimal place.

        Computed as ``10 ** (-score / length)`` where ``length`` is the number
        of whitespace-separated pieces plus one.
        """
        encoded = self._encode(doc)
        log_score = self.model.score(encoded)
        # The +1 presumably accounts for the end-of-sentence token that KenLM
        # includes in the score by default -- TODO confirm.
        length = len(encoded.split()) + 1
        return round(10.0 ** (-log_score / length), 1)
|
|