In [33]:
def normalize_punctuations(line: str) -> str:
    """Return ``line`` with selected half-width punctuation made full-width.

    The characters ``, ! ? : ; ( )`` are each swapped for their full-width
    Chinese counterpart; every other character is passed through unchanged.
    """
    half_to_full_width = str.maketrans({
        ",": "，",
        "!": "！",
        "?": "？",
        ":": "：",
        ";": "；",
        "(": "（",
        ")": "）",
    })
    # Single pass over the string; each source character maps to one target.
    return line.translate(half_to_full_width)

In [34]:
import re
from functools import reduce

# A line is treated as a sentence when it starts with optional digits followed
# by "hz "; the text after that prefix is the sentence itself.
can_sentence_start = re.compile(r"[0-9]*hz ")
can_lines = []

abc_paths = [
    "train/abc/abc_cantonese_index_00001_to_04587_line_1_to_4575.xml",
    "train/abc/abc_cantonese_index_04588_to_09175_line_4576_to_9150.xml",
    "train/abc/abc_cantonese_index_09176_to_13775_line_9151_to_13725.xml",
    "train/abc/abc_cantonese_index_13776_to_FE99FD5B4E37BE32_line_13726_to_18302.xml",
]

# Read with an explicit encoding so decoding does not depend on the platform's
# locale default (assumes the corpus files are UTF-8 -- TODO confirm).
# Extending one list avoids re-copying the accumulated lines for every file.
lines = []
for abc_path in abc_paths:
    with open(abc_path, "r", encoding="utf-8") as abc_file:
        lines.extend(abc_file.read().splitlines())

for line in lines:
    match = can_sentence_start.match(line)
    # Skip lines carrying the "(empty band???)" marker.
    if match and "(empty band???)" not in line:
        line = line[match.end():].strip()
        # Keep only sentences of at least 5 characters.
        if len(line) >= 5:
            can_lines.append(normalize_punctuations(line))

print("Got {} Cantonese sentences with length >= 5".format(len(can_lines)))

Got 14838 Cantonese sentences with length >= 5


In [35]:
# Every distinct character found in the wordshk file counts as "common".
# The whole file is scanned, so whitespace and newline characters are included.
common_can_charset = set()

# Explicit encoding keeps decoding independent of the platform's locale default
# (assumes the file is UTF-8 -- TODO confirm).
with open("train/wordshk.can", "r", encoding="utf-8") as wordshk_file:
    common_can_charset.update(wordshk_file.read())

print(f"Found {len(common_can_charset)} common Cantonese characters")

Found 4527 common Cantonese characters


In [36]:
from collections import defaultdict

# Tally each character in the collected sentences that is absent from the
# common character set.
rare_can_charset = defaultdict(int)
for sentence in can_lines:
    for char in sentence:
        if char in common_can_charset:
            continue
        rare_can_charset[char] += 1

print(f"Found {len(rare_can_charset)} rare Cantonese characters")

# Highest count first; the sort is stable, so ties keep first-seen order.
by_descending_freq = sorted(
    rare_can_charset.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
charset_sort_by_freq = dict(by_descending_freq)
for c, freq in charset_sort_by_freq.items():
    print(c, freq)

Found 365 rare Cantonese characters
𠹺 388
噖 162
𡁵 157
𠶧 88
嚫 88
屘 57
衭 47
贃 43
說 35
𧵳 30
歳 27
𢫏 27
𨶙 25
癐 25
𦡆 25
𨃩 24
况 21
内 19
𢵌 19
𦧺 18
𠹌 18
爲 16
𢱑 16
𡁯 15
𠱓 14
𠵿 14
踹 13
㗇 13
𠾴 13
嗍 13
𧘹 13
𠹳 12
𠹭 12
脫 12
䁪 11
𧨾 11
掬 11
𠸐 11
啥 11
𠱃 10
噔 10
捹 10
𠹻 10
𠼻 10
噠 10
𨳊 10
𢲲 9
𨉖 9
躭 9
䠋 9
嘮 9
啽 9
滮 8
㧻 8
𧶄 8
𦛚 8
撠 8
呡 8
睸 8
𠰲 8
𥔿 8
唎 8
𠸊 8
𬜐 8
蔥 8
呱 8
Ｂ 7
𢯊 7
𫫃 7
𢝵 7
銹 7
㓤 7
䁯 7
啉 7
臥 7
𠓼 7
稅 7
 7
喴 7
噱 7
衛 6
𡄯 6
揤 6
𢤹 6
 6
鷄 6
湴 6
 6
𦣇 6
齧 6
𠮨 6
 6
𡀝 6
婄 6
𠼱 6
𠱂 5
磧 5
𠰋 5
𡂖 5
浭 5
擏 5
𥋇 5
揢 5
㨆 5
𠾍 5
兌 5
𢺳 5
坺 5
鍚 5
𣘚 5
𪘁 5
𨳍 5
嗙 5
𠼰 5
𨳒 4
唿 4
𣳼 4
𦂥 4
溚 4
囋 4
瀄 4
𠌥 4
𢫦 4
𢶍 4
𠲵 4
䉺 4
炕 4
𢴈 4
𡲢 4
𥅈 4
𬧊 4
簕 4
査 4
𩜠 4
𫬿 4
𠜱 4
嚬 4
𠹹 4
𦉘 4
唦 4
㨘 4
𡄽 4
熗 4
𡁷 4
𠿬 4
咜 4
𠸏 4
𡁸 4
𡃵 4
𪚩 4
Ｄ 4
Ｑ 4
𨆯 3
啗 3
蔸 3
舗 3
囪 3
艔 3
洩 3
𢵧 3
菓 3
䪴 3
䆲 3
痱 3
趿 3
𠮩 3
搉 3
矋 3
𠻗 3
𢲈 3
潞 3
沬 3
揇 3
齃 3
𡃤 3
𡃶 3
瀟 3
軨 3
鉻 3
 3
㿭 3
𢵄 3
㗲 3
𢫕 3
𢰸 3
葫 3
咔 3
嚎 3
嗿 3
咈 3
咾 3
 3
𠵈 3
吥 3
𠾭 3
𠾵 3
朘 3
觥 3
㩧 2
焙 2
兀 2
䭤 2
饊 2
[ 2
] 2
炖 2
争 2
䁓 2
𡂝 2
𩬎 2
鈒 2
亁 2
炠 2
摼 2
𠺬 2
𠵉 2
蝄 2
 2
蔫 2
㘉 2
荏 2
墘 2
嗏 2
呣 2
曚 2
壬 2
揅 2
溼 2
囓 2

In [46]:
# Maps a character to its normalized form, loaded from a tab-separated file
# with exactly two fields per line: <character>\t<normalized form>.
char_to_normalized_char = {}

# Explicit encoding keeps decoding independent of the platform's locale default
# (assumes the mapping file is UTF-8 -- TODO confirm).
with open("zh_char2str_mapping.txt", "r", encoding="utf-8") as input_file:
    for line in input_file.read().splitlines():
        # Unpacking raises ValueError on any line without exactly two fields.
        c, n = line.split("\t")
        char_to_normalized_char[c] = n

print("Found {} normalized mappings".format(len(char_to_normalized_char)))

Found 12360 normalized mappings


In [49]:
# Emit one tab-separated row per rare character: the character, its normalized
# form ("???" when no mapping is known), and its frequency.
for c, freq in charset_sort_by_freq.items():
    normalized = char_to_normalized_char.get(c, "???")
    print("\t".join((c, normalized, str(freq))))

𠹺	埋	388
噖	琴	162
𡁵	緊	157
𠶧	掂	88
嚫	親	88
屘	尾	57
衭	衤夫	47
贃	賺	43
說	???	35
𧵳	???	30
歳	歲	27
𢫏	全	27
𨶙	能	25
癐	???	25
𦡆	???	25
𨃩	⻊扇	24
况	???	21
内	內	19
𢵌	扌隊	19
𦧺	賴	18
𠹌	 o能	18
爲	為	16
𢱑	抓	16
𡁯	???	15
𠱓	詭	14
𠵿	披	14
踹	???	13
㗇	???	13
𠾴	棒	13
嗍	索	13
𧘹	太	13
𠹳	傑	12
𠹭	???	12
脫	???	12
䁪	???	11
𧨾	氹	11
掬	???	11
𠸐	???	11
啥	???	11
𠱃	 o凹	10
噔	 o登	10
捹	扌奔	10
𠹻	???	10
𠼻	基	10
噠	???	10
𨳊	九	10
𢲲	???	9
𨉖	???	9
躭	耽	9
䠋	卑	9
嘮	???	9
啽	 o弇	9
滮	氵彪	8
㧻	扌涿	8
𧶄	???	8
𦛚	???	8
撠	扌戟	8
呡	 o吻	8
睸	目眉	8
𠰲	???	8
𥔿	???	8
唎	脷	8
𠸊	???	8
𬜐	???	8
蔥	葱	8
呱	???	8
Ｂ	???	7
𢯊	扌的	7
𫫃	???	7
𢝵	???	7
銹	鏽	7
㓤	吉刂	7
䁯	???	7
啉	 o林	7
臥	???	7
𠓼	???	7
稅	???	7
	???	7
喴	 o威	7
噱	???	7
衛	???	6
𡄯	???	6
揤	扌即	6
𢤹	???	6
	???	6
鷄	雞	6
湴	氵並	6
	???	6
𦣇	???	6
齧	咬	6
𠮨	乃	6
	???	6
𡀝	???	6
婄	蓓	6
𠼱	累	6
𠱂	???	5
磧	石責	5
𠰋	???	5
𡂖	???	5
浭	氵更	5
擏	擎	5
𥋇	掌	5
揢	扌客	5
㨆	扌林	5
𠾍	棄	5
兌	???	5
𢺳	???	5
坺	土拔	5
鍚	???	5
𣘚	???	5
𪘁	???	5
𨳍	七	5
嗙	 o旁	5
𠼰	???	5
𨳒	小	4
唿	篋	4
𣳼	???	4
𦂥	???	4
溚	塔	4
囋	???	4
瀄	吱	4
𠌥	???	4
𢫦	???	4
𢶍	???	4
𠲵	???	4
䉺	米	4
炕	???	4
𢴈	撻	4
𡲢	???	4
𥅈	立	4
𬧊	???	4
簕	勒	4
査	

In [57]:
# Rare character -> normalized character, loaded from a tab-separated file with
# three fields per line: <character>\t<normalized form>\t<frequency>.
abc_mapping = {}

# Explicit encoding keeps decoding independent of the platform's locale default
# (assumes the mapping file is UTF-8 -- TODO confirm).
with open("abc_rare_char_mapping.txt", "r", encoding="utf-8") as input_file:
    for line in input_file.read().splitlines():
        # Unpacking raises ValueError on any line without exactly three fields.
        c, n, _freq = line.split("\t")
        # Keep only mappings whose normalized form is a single character;
        # empty or multi-character forms are skipped.
        if len(n) == 1:
            abc_mapping[c] = n

print("Loaded {} normalization mappings".format(len(abc_mapping)))
print("Sample of first 10 highest frequency mappings:")
# Dict order follows file order, so this shows the first 10 kept entries.
print(list(abc_mapping.items())[:10])

Loaded 177 normalization mappings
Sample of first 10 highest frequency mappings:
[('𠹺', '埋'), ('噖', '琴'), ('𡁵', '緊'), ('𠶧', '掂'), ('嚫', '親'), ('屘', '尾'), ('衭', '褲'), ('贃', '賺'), ('說', '説'), ('𧵳', '蝕')]


In [58]:
# Replace all occurrences of rare characters with normalized ones.
def normalize_abc(line: str) -> str:
    """Return ``line`` with rare characters swapped for normalized ones.

    Each entry of ``abc_mapping`` is applied in the mapping's iteration order,
    one ``str.replace`` at a time, so a later entry sees the result of earlier
    ones. Two word-level fix-ups are then applied to the result.
    """
    normalized = line
    for rare_char, replacement in abc_mapping.items():
        normalized = normalized.replace(rare_char, replacement)
    # NOTE(review): these run after the per-character pass, so they only match
    # if "𠺢" is not itself a key of abc_mapping -- confirm.
    for variant, standard in (("而𠺢", "而家"), ("依𠺢", "依家")):
        normalized = normalized.replace(variant, standard)
    return normalized

In [59]:
# Write one normalized sentence per line. The explicit UTF-8 encoding means the
# output (which contains non-ASCII characters) does not depend on the
# platform's locale default. Mode "w" suffices because nothing is read back.
with open("train/abc.can", "w", encoding="utf-8") as output_file:
    for line in can_lines:
        output_file.write(normalize_abc(line) + "\n")
