# radiobee-aligner/tests/test_text2lists.py
# freemt
# Update slow-track for more lang pairs
# 7fd4e54
# raw
# history blame
# 1.89 kB
"""Test text2lists."""
from pathlib import Path
from radiobee.loadtext import loadtext
from radiobee.text2lists import text2lists
def test_text2lists_dual1():
    """Test text2lists on data/test-dual.txt as loaded by loadtext.

    The data file is addressed with forward slashes so the path resolves
    on POSIX systems as well as on Windows.
    """
    filename = "data/test-dual.txt"
    text = loadtext(filename)
    l1, l2 = text2lists(text)

    # The first entry of the second list is expected to be empty.
    assert l2[0] == ""
    # The first entry of the first list must contain the leading "国际".
    assert "国际" in l1[0]
    assert "2021" in l2[5]
def test_shakespeare1000():
    r"""Smoke-test text2lists on data/shakespeare-zh-en-4000.txt.

    No assertion is made on the result: the test passes when the file can
    be read as UTF-8 and text2lists returns something that unpacks into
    exactly two values.

    Scratch recipe kept from the original docstring ("Separate first
    1000.") for producing the data file. It is not executed here and
    relies on names such as cchardet that this module does not import;
    verify before reuse::

        from pathlib import Path
        import zipfile

        dir_loc = r""
        filename = r"莎士比亚 - 莎士比亚全集(套装共39本 英汉双语)-外语教学与研究出版社 (2016).txt.zip"
        zfile = zipfile.ZipFile(Path(dir_loc) / filename)
        res_bytes = zfile.read(zfile.infolist()[0])
        encoding = cchardet.detect(res_bytes).get("encoding")

        text1000 = []
        line = 0
        numb_lines = 4000
        for elm in res_bytes.splitlines():
            if elm.decode(encoding).strip():
                text1000.append(elm.decode(encoding))
                if line >= numb_lines - 1:
                    break
                line += 1

        Path(f"data/shakespeare-zh-en-{numb_lines}.txt").write_text("\n".join(text1000), encoding="utf8")

        tset = cmat2test(cmat)
        df = pd.DataFrame(tset).rename(columns=dict(zip(range(0, 3), ['x', 'y', 'cos'])))
        plot_df(df)
    """
    # Earlier variants of this check against the 1000-line sample file
    # were disabled by the author; only the 4000-line sample is exercised.
    sample_path = Path("data/shakespeare-zh-en-4000.txt")
    sample_text = sample_path.read_text(encoding="utf8")

    # Both returned parts are intentionally unused; only the call matters.
    left_part, right_part = text2lists(sample_text)
def test_test_dual2():
    """Smoke-test text2lists on data/test-dual.txt read directly as UTF-8.

    No assertion is made on the content; the test passes when the file is
    readable and the result of text2lists unpacks into exactly two values.
    """
    dual_path = Path("data/test-dual.txt")
    dual_text = dual_path.read_text(encoding="utf8")

    # Unpacking checks the two-value shape of the result; values are unused.
    left_part, right_part = text2lists(dual_text)