from typing import Optional, TextIO
import re

import click
import conllu
import jsonlines

# Usable sentence text: optionally wrapped in single quotes and containing no
# "/" anywhere. The quotes are dropped from the captured ``content`` group.
# NOTE(review): presumably "/" marks alternative readings in the treebank and
# such sentences are deliberately excluded -- confirm against the data.
_TEXT_PATTERN = re.compile(r"'?(?P<content>[^/]+?)'?$")

# Metadata keys of the form ``text_<langcode>`` carry the translations.
_TRANSLATION_KEY_PATTERN = re.compile(r"text_(?P<langcode>.*)")


def _clean_text(raw: Optional[str]) -> Optional[str]:
    """Return *raw* without its optional surrounding quotes, or ``None``.

    ``None`` is returned when *raw* is not a string (e.g. a metadata key
    without a value) or when it does not match ``_TEXT_PATTERN`` (it is empty
    or contains a ``/``).
    """
    if not isinstance(raw, str):
        return None
    matched = _TEXT_PATTERN.match(raw)
    if matched is None:
        return None
    return matched.group("content")


@click.command(help="Extract a parallel corpus from a CoNLL-U file with translations")
@click.argument("conllu_path", type=click.File("r"))
@click.argument("output_path", type=click.File("w"), default="-")
@click.option("--main-langcode", default="br", show_default=True)
@click.option("--require-langcode", multiple=True, show_default=True)
def main(
    conllu_path: TextIO,
    main_langcode: str,
    output_path: TextIO,
    require_langcode: tuple[str, ...],
):
    """Write one JSON line per usable sentence of a CoNLL-U file.

    Each output record has the shape
    ``{"translation": {<main_langcode>: <text>, <langcode>: <text>, ...}}``.

    Args:
        conllu_path: Open text stream with the CoNLL-U input.
        main_langcode: Key under which the sentence's ``text`` metadata is
            stored in the output record.
        output_path: Open text stream receiving the JSON lines output.
        require_langcode: Language codes that must all have a usable
            translation; sentences missing any of them are skipped.

    A sentence is skipped when its ``text`` metadata is absent or unusable
    (see ``_TEXT_PATTERN``). Unusable translations are dropped individually.
    """
    with jsonlines.Writer(output_path) as out_stream:
        for tokenlist in conllu.parse_incr(conllu_path):
            # ``.get`` rather than indexing: a sentence without a ``# text``
            # comment is skipped instead of aborting the whole run.
            main_text = _clean_text(tokenlist.metadata.get("text"))
            if main_text is None:
                continue

            translations = {}
            for key, value in tokenlist.metadata.items():
                key_match = _TRANSLATION_KEY_PATTERN.match(key)
                if key_match is None:
                    continue
                content = _clean_text(value)
                if content is None:
                    continue
                translations[key_match.group("langcode")] = content

            if not all(code in translations for code in require_langcode):
                continue

            out_stream.write(
                {
                    "translation": {
                        main_langcode: main_text,
                        **translations,
                    }
                }
            )


if __name__ == "__main__":
    main()