File size: 1,486 Bytes
eeaaffa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from typing import TextIO
import re

import click
import conllu
import jsonlines


@click.command(help="Extract a parallel corpus from a CoNLL-U file with translations")
@click.argument("conllu_path", type=click.File("r"))
@click.argument("output_path", type=click.File("w"), default="-")
@click.option("--main-langcode", default="br", show_default=True)
@click.option("--require-langcode", multiple=True, show_default=True)
def main(
    conllu_path: TextIO,
    main_langcode: str,
    output_path: TextIO,
    require_langcode: tuple[str, ...],
):
    """Write one JSON line per usable sentence of a CoNLL-U stream.

    For every sentence yielded by ``conllu.parse_incr``, the ``text``
    metadata entry is taken as the main-language text and every
    ``text_<langcode>`` entry as a translation. Each value has at most one
    leading and one trailing single quote removed. Each kept sentence is
    written as ``{"translation": {main_langcode: text, langcode: text, ...}}``.

    A sentence is skipped when:

    * it has no ``text`` metadata, or that text is empty or contains ``/``;
    * any language code in ``require_langcode`` has no usable translation.

    Translations whose value contains ``/`` are dropped individually; they
    do not cause the whole sentence to be skipped unless their language is
    required.

    Args:
        conllu_path: Open text stream with the CoNLL-U input.
        main_langcode: Key under which the ``text`` metadata is stored.
        output_path: Open text stream receiving the JSON lines.
        require_langcode: Language codes that must all have a translation
            for a sentence to be written (click passes a tuple, possibly
            empty).
    """
    # Both patterns are loop-invariant, so compile them once up front.
    # Content: optional surrounding single quotes, no "/" anywhere.
    content_pattern = re.compile(r"'?(?P<content>[^/]+?)'?$")
    translation_key_pattern = re.compile(r"text_(?P<langcode>.*)")

    with jsonlines.Writer(output_path) as out_stream:
        for tokenlist in conllu.parse_incr(conllu_path):
            # A sentence without a `# text` comment is skipped like any
            # other unusable sentence instead of aborting with KeyError.
            raw_text = tokenlist.metadata.get("text")
            if raw_text is None:
                continue
            text_match = content_pattern.match(raw_text)
            if text_match is None:
                continue
            main_text = text_match.group("content")

            translations = {
                key_match.group("langcode"): value_match.group("content")
                for key, value in tokenlist.metadata.items()
                if (key_match := translation_key_pattern.match(key))
                and (value_match := content_pattern.match(value))
            }
            if not all(langcode in translations for langcode in require_langcode):
                continue

            # NOTE: a `text_<main_langcode>` metadata entry, if present,
            # overrides `main_text` because `**translations` is unpacked last.
            out_stream.write(
                {
                    "translation": {
                        main_langcode: main_text,
                        **translations,
                    }
                }
            )


# Invoke the click command only when this file is executed as a script,
# so importing the module does not trigger argument parsing.
if __name__ == "__main__":
    main()