yuancwang
init
b725c5a
raw
history blame
2.37 kB
# This module is from [WeNet](https://github.com/wenet-e2e/wenet).
# ## Citations
# ```bibtex
# @inproceedings{yao2021wenet,
# title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
# author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
# booktitle={Proc. Interspeech},
# year={2021},
# address={Brno, Czech Republic },
# organization={IEEE}
# }
# @article{zhang2022wenet,
# title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit},
# author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei},
# journal={arXiv preprint arXiv:2203.15455},
# year={2022}
# }
#
import re
def read_lists(list_file):
lists = []
with open(list_file, "r", encoding="utf8") as fin:
for line in fin:
lists.append(line.strip())
return lists
def read_non_lang_symbols(non_lang_sym_path):
"""read non-linguistic symbol from file.
The file format is like below:
{NOISE}\n
{BRK}\n
...
Args:
non_lang_sym_path: non-linguistic symbol file path, None means no any
syms.
"""
if non_lang_sym_path is None:
return None
else:
syms = read_lists(non_lang_sym_path)
non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
for sym in syms:
if non_lang_syms_pattern.fullmatch(sym) is None:
class BadSymbolFormat(Exception):
pass
raise BadSymbolFormat(
"Non-linguistic symbols should be "
"formatted in {xxx}/<xxx>/[xxx], consider"
" modify '%s' to meet the requirment. "
"More details can be found in discussions here : "
"https://github.com/wenet-e2e/wenet/pull/819" % (sym)
)
return syms
def read_symbol_table(symbol_table_file):
symbol_table = {}
with open(symbol_table_file, "r", encoding="utf8") as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 2
symbol_table[arr[0]] = int(arr[1])
return symbol_table