lgrobol
/

m2m100_418M_br_fr

text2text-generation

Inference Endpoints

Model card Files Files and versions Community

m2m100_418M_br_fr / extract_sents.py

lgrobol's picture

update to new dataset version

eeaaffa verified almost 2 years ago

history blame contribute delete

No virus

1.49 kB

	from typing import TextIO
	import re

	import click
	import conllu
	import jsonlines


	@click.command(help="Extract a parallel corpus from a CoNLL-U file with translations")
	@click.argument("conllu_path", type=click.File("r", encoding="utf-8"))
	@click.argument("output_path", type=click.File("w", encoding="utf-8"), default="-")
	@click.option("--main-langcode", default="br", show_default=True)
	@click.option("--require-langcode", multiple=True, show_default=True)
	def main(
	    conllu_path: TextIO,
	    main_langcode: str,
	    output_path: TextIO,
	    require_langcode: tuple[str, ...],
	) -> None:
	    """Write one JSON line per usable sentence of a CoNLL-U file.

	    Sentences are read incrementally from ``conllu_path``. For each one, the
	    ``text`` metadata entry is the main-language sentence and every
	    ``text_<langcode>`` metadata entry is a translation. Each kept sentence
	    is written to ``output_path`` as
	    ``{"translation": {main_langcode: text, <langcode>: translation, ...}}``.

	    A text is usable when it is non-empty and contains no ``/``; one optional
	    leading and one optional trailing single quote are stripped from it.
	    A sentence is skipped when its ``text`` is missing or unusable, or when
	    any code in ``require_langcode`` has no usable translation. Unusable
	    translations are dropped individually.

	    Both files are opened as UTF-8 regardless of the locale.

	    Args:
	        conllu_path: Open text stream with the CoNLL-U input.
	        main_langcode: Key under which the ``text`` metadata is stored.
	        output_path: Open text stream receiving the JSON lines
	            (``-``, the default, is handled by ``click.File``).
	        require_langcode: Language codes that must all have a usable
	            translation for a sentence to be written (click passes a tuple
	            because the option is declared with ``multiple=True``).
	    """
	    # Compiled once: both patterns are applied to every sentence.
	    usable_text = re.compile(r"'?(?P<content>[^/]+?)'?$")
	    translation_key = re.compile(r"text_(?P<langcode>.*)")

	    with jsonlines.Writer(output_path) as out_stream:
	        for tokenlist in conllu.parse_incr(conllu_path):
	            metadata = tokenlist.metadata

	            # A sentence without a "# text" comment is skipped instead of
	            # aborting the whole run with a KeyError.
	            raw_text = metadata.get("text")
	            if not isinstance(raw_text, str):
	                continue
	            text_match = usable_text.match(raw_text)
	            if text_match is None:
	                continue

	            translations = {}
	            for key, value in metadata.items():
	                key_match = translation_key.match(key)
	                # NOTE(review): the isinstance guard assumes a metadata
	                # value may be a non-string (e.g. None) -- confirm against
	                # the conllu parser; it avoids a TypeError in that case.
	                if key_match is None or not isinstance(value, str):
	                    continue
	                value_match = usable_text.match(value)
	                if value_match is not None:
	                    langcode = key_match.group("langcode")
	                    translations[langcode] = value_match.group("content")

	            if any(code not in translations for code in require_langcode):
	                continue

	            # Translations are merged last, so a "text_<main_langcode>"
	            # entry, if present, replaces the value taken from "text".
	            out_stream.write(
	                {
	                    "translation": {
	                        main_langcode: text_match.group("content"),
	                        **translations,
	                    }
	                }
	            )


	# Run the click command only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()