|
from typing import TextIO |
|
import re |
|
|
|
import click |
|
import conllu |
|
import jsonlines |
|
|
|
|
|
@click.command(help="Extract a parallel corpus from a CoNLL-U file with translations")
@click.argument("conllu_path", type=click.File("r"))
@click.argument("output_path", type=click.File("w"), default="-")
@click.option("--main-langcode", default="br", show_default=True)
@click.option("--require-langcode", multiple=True, show_default=True)
def main(
    conllu_path: TextIO,
    main_langcode: str,
    output_path: TextIO,
    require_langcode: tuple[str, ...],
):
    """Write one JSON line per sentence pairing its text with its translations.

    Each sentence's ``text`` metadata is stored under ``main_langcode`` and
    every ``text_<langcode>`` metadata entry under ``<langcode>``. The emitted
    record has the shape ``{"translation": {langcode: text, ...}}``.

    A sentence is skipped when its ``text`` metadata is missing or does not
    match the content pattern (for instance because it contains a ``/``), or
    when any language code in ``require_langcode`` has no usable translation.
    Individual translations that do not match the pattern are dropped.

    Args:
        conllu_path: Open text stream with the CoNLL-U input.
        main_langcode: Key under which the sentence's own text is stored.
        output_path: Open text stream receiving the JSON lines output.
        require_langcode: Language codes that must be present among the
            translations for a sentence to be written.
    """
    # Compiled once, outside the per-sentence loop. The pattern strips one
    # optional single quote at each end and rejects any value with a slash.
    content_pattern = re.compile(r"'?(?P<content>[^/]+?)'?$")
    translation_key_pattern = re.compile(r"text_(?P<langcode>.*)")
    required_langcodes = frozenset(require_langcode)

    with jsonlines.Writer(output_path) as out_stream:
        for tokenlist in conllu.parse_incr(conllu_path):
            # Skip sentences without a usable `text` value instead of raising
            # KeyError, consistent with how unmatched text is handled below.
            raw_text = tokenlist.metadata.get("text")
            if not isinstance(raw_text, str):
                continue
            text_match = content_pattern.match(raw_text)
            if text_match is None:
                continue
            main_text = text_match.group("content")

            translations = {}
            for key, value in tokenlist.metadata.items():
                key_match = translation_key_pattern.match(key)
                if key_match is None or not isinstance(value, str):
                    continue
                value_match = content_pattern.match(value)
                if value_match is not None:
                    translations[key_match.group("langcode")] = value_match.group(
                        "content"
                    )

            # Only the translations are checked here, not the main text.
            if not required_langcodes.issubset(translations):
                continue

            out_stream.write(
                {
                    "translation": {
                        main_langcode: main_text,
                        # Unpacked last, so a `text_<main_langcode>` entry
                        # takes precedence over the plain `text` value.
                        **translations,
                    }
                }
            )
|
|
|
|
|
# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|