m2m100_418M_br_fr / extract_sents.py
lgrobol's picture
update to new dataset version
eeaaffa verified
raw
history blame
1.49 kB
from typing import TextIO
import re
import click
import conllu
import jsonlines
# Sentence-text pattern: drops one optional pair of enclosing single quotes and
# refuses to match any text that contains a "/" character.
_TEXT_RE = re.compile(r"'?(?P<content>[^/]+?)'?$")
# Metadata keys of the form ``text_<langcode>`` hold the translations.
_TRANSLATION_KEY_RE = re.compile(r"text_(?P<langcode>.*)")


@click.command(help="Extract a parallel corpus from a CoNLL-U file with translations")
@click.argument("conllu_path", type=click.File("r"))
@click.argument("output_path", type=click.File("w"), default="-")
@click.option("--main-langcode", default="br", show_default=True)
@click.option("--require-langcode", multiple=True, show_default=True)
def main(
    conllu_path: TextIO,
    main_langcode: str,
    output_path: TextIO,
    require_langcode: tuple[str, ...],
):
    """Write one ``{"translation": {...}}`` record per usable sentence.

    For every sentence read from ``conllu_path``, the ``text`` metadata entry
    is stored under ``main_langcode`` and every ``text_<langcode>`` metadata
    entry is stored under its ``<langcode>``. Records are written to
    ``output_path`` through ``jsonlines.Writer``.

    A sentence is skipped (nothing is written for it) when:

    * it has no ``text`` metadata entry;
    * its ``text`` does not match the sentence-text pattern (in particular
      when it contains a ``/``);
    * one of the language codes in ``require_langcode`` has no usable
      translation.

    Args:
        conllu_path: Open text stream containing the CoNLL-U data.
        main_langcode: Key under which the sentence's own text is stored.
        output_path: Open text stream receiving the records.
        require_langcode: Language codes that must all be present among the
            translations for a sentence to be kept.
    """
    with jsonlines.Writer(output_path) as out_stream:
        for tokenlist in conllu.parse_incr(conllu_path):
            # A sentence without a ``# text`` comment is skipped like any
            # other unusable sentence instead of aborting the whole run.
            raw_text = tokenlist.metadata.get("text")
            if raw_text is None:
                continue
            text_match = _TEXT_RE.match(raw_text)
            if text_match is None:
                continue
            main_text = text_match.group("content")

            translations = {}
            for key, value in tokenlist.metadata.items():
                key_match = _TRANSLATION_KEY_RE.match(key)
                # Value-less metadata entries cannot hold a translation.
                if key_match is None or value is None:
                    continue
                value_match = _TEXT_RE.match(value)
                if value_match is None:
                    continue
                translations[key_match.group("langcode")] = value_match.group(
                    "content"
                )

            if any(code not in translations for code in require_langcode):
                continue

            out_stream.write(
                {
                    "translation": {
                        # Listed first so it leads the record; a
                        # ``text_<main_langcode>`` entry, if present,
                        # replaces this value.
                        main_langcode: main_text,
                        **translations,
                    }
                }
            )
# Run the click command only when the file is executed as a script.
if __name__ == "__main__":
    main()