# Breton-French (br-fr) parallel-corpus preparation pipeline.
# The step types used below (opus_read, concatenate, filter, train_alignment)
# match the OpusFilter configuration format.
#
# Data flow, as wired by the file names in this file:
#   1. Read the OfisPublik, OpenSubtitles and Tatoeba corpora.
#   2. Pool OfisPublik + Tatoeba as "good" and OpenSubtitles as "dubious".
#   3. Train alignment priors on the length-filtered union of both pools.
#   4. Filter the "dubious" pool with a LengthFilter and a WordAlignFilter
#      that uses those priors.
#   5. Merge the surviving "dubious" pairs with "good" and run the final
#      filter chain.
common:
  # Presumably every relative file name below is resolved against this
  # directory - TODO confirm against the tool's documentation.
  output_directory: local/opus

steps:
  # ------------------------------------------------------------------------
  # Corpus download
  # ------------------------------------------------------------------------

  # The quality of WikiMatrix is really, really bad for this pair: very poor
  # alignment.
  # - type: opus_read
  #   parameters:
  #     corpus_name: WikiMatrix
  #     source_language: br
  #     target_language: fr
  #     release: latest
  #     preprocessing: raw
  #     src_output: wiki.br.gz
  #     tgt_output: wiki.fr.gz

  # The quality of ccmatrix is really, really bad for this pair: very few
  # usable Breton sentences.
  # NOTE(review): the comment says "ccmatrix" but the disabled step names the
  # MultiCCAligned corpus - confirm which one was actually evaluated.
  # - type: opus_read
  #   parameters:
  #     corpus_name: MultiCCAligned
  #     source_language: br
  #     target_language: fr
  #     release: latest
  #     preprocessing: raw
  #     src_output: cc.br.gz
  #     tgt_output: cc.fr.gz

  - type: opus_read
    parameters:
      corpus_name: OfisPublik
      source_language: br
      target_language: fr
      release: latest
      preprocessing: raw
      src_output: ofis.br.gz
      tgt_output: ofis.fr.gz
      suppress_prompts: true

  - type: opus_read
    parameters:
      corpus_name: OpenSubtitles
      source_language: br
      target_language: fr
      release: latest
      preprocessing: raw
      src_output: ost.br.gz
      tgt_output: ost.fr.gz
      suppress_prompts: true

  - type: opus_read
    parameters:
      corpus_name: Tatoeba
      source_language: br
      target_language: fr
      release: latest
      preprocessing: raw
      src_output: tatoeba.br.gz
      tgt_output: tatoeba.fr.gz
      suppress_prompts: true

  # Disabled corpora. To re-enable one, uncomment its step here AND the
  # matching entries in both "dubious" concatenate steps further down.
  # - type: opus_read
  #   parameters:
  #     corpus_name: wikimedia
  #     source_language: br
  #     target_language: fr
  #     release: latest
  #     preprocessing: raw
  #     src_output: wikimedia.br.gz
  #     tgt_output: wikimedia.fr.gz
  #     suppress_prompts: true

  # - type: opus_read
  #   parameters:
  #     corpus_name: Mozilla-I10n
  #     source_language: br
  #     target_language: fr
  #     release: latest
  #     preprocessing: raw
  #     src_output: mozilla.br.gz
  #     tgt_output: mozilla.fr.gz
  #     suppress_prompts: true

  # - type: opus_read
  #   parameters:
  #     corpus_name: KDE4
  #     source_language: br
  #     target_language: fr
  #     release: latest
  #     preprocessing: raw
  #     src_output: kde.br.gz
  #     tgt_output: kde.fr.gz
  #     suppress_prompts: true

  # - type: opus_read
  #   parameters:
  #     corpus_name: GNOME
  #     source_language: br
  #     target_language: fr
  #     release: latest
  #     preprocessing: raw
  #     src_output: gnome.br.gz
  #     tgt_output: gnome.fr.gz
  #     suppress_prompts: true

  # ------------------------------------------------------------------------
  # "good" pool: OfisPublik + Tatoeba
  # The br and fr steps must list their inputs in the same order so that the
  # two output files stay line-aligned.
  # ------------------------------------------------------------------------
  - type: concatenate
    parameters:
      inputs:
        - ofis.br.gz
        - tatoeba.br.gz
      output: good.br.gz

  - type: concatenate
    parameters:
      inputs:
        - ofis.fr.gz
        - tatoeba.fr.gz
      output: good.fr.gz

  # ------------------------------------------------------------------------
  # "dubious" pool: currently OpenSubtitles only
  # ------------------------------------------------------------------------
  - type: concatenate
    parameters:
      inputs:
        # - wiki.br.gz
        # - cc.br.gz
        # - wikimedia.br.gz
        # - gnome.br.gz
        # - kde.br.gz
        # - mozilla.br.gz
        - ost.br.gz
      output: dubious.br.gz

  - type: concatenate
    parameters:
      inputs:
        # - wiki.fr.gz
        # - cc.fr.gz
        # - wikimedia.fr.gz
        # - gnome.fr.gz
        # - kde.fr.gz
        # - mozilla.fr.gz
        - ost.fr.gz
      output: dubious.fr.gz

  # ------------------------------------------------------------------------
  # Alignment priors, trained on the union of both pools
  # ------------------------------------------------------------------------
  - type: concatenate
    parameters:
      inputs:
        - dubious.br.gz
        - good.br.gz
      output: align_train.br.gz

  - type: concatenate
    parameters:
      inputs:
        - dubious.fr.gz
        - good.fr.gz
      output: align_train.fr.gz

  # Keep only pairs of 1 to 128 words before training the alignment model.
  - type: filter
    parameters:
      inputs:
        - align_train.br.gz
        - align_train.fr.gz
      outputs:
        - align_train-filtered.br.gz
        - align_train-filtered.fr.gz
      filters:
        - LengthFilter:
            unit: word
            min_length: 1
            max_length: 128

  - type: train_alignment
    parameters:
      src_data: align_train-filtered.br.gz
      tgt_data: align_train-filtered.fr.gz
      output: alignment.priors
      # Empty mapping: no alignment-model options are overridden here.
      parameters: {}

  # ------------------------------------------------------------------------
  # Clean the "dubious" pool using the trained priors
  # ------------------------------------------------------------------------
  # TODO: dedup and more aggressive filtering
  - type: filter
    parameters:
      inputs:
        - dubious.br.gz
        - dubious.fr.gz
      outputs:
        - dubious-filtered.br.gz
        - dubious-filtered.fr.gz
      filters:
        # Stricter lower bound (4 words) than the other LengthFilter uses.
        - LengthFilter:
            unit: word
            min_length: 4
            max_length: 128
        - WordAlignFilter:
            priors: alignment.priors

  # ------------------------------------------------------------------------
  # Merge and final filtering
  # ------------------------------------------------------------------------
  - type: concatenate
    parameters:
      inputs:
        - dubious-filtered.br.gz
        - good.br.gz
      output: all.br.gz

  - type: concatenate
    parameters:
      inputs:
        - dubious-filtered.fr.gz
        - good.fr.gz
      output: all.fr.gz

  # NOTE(review): as written, these disabled steps deduplicate each language
  # side independently, which would presumably drop different lines from each
  # file and break the br/fr line alignment. If dedup is enabled, both files
  # likely need to go through a single step - verify against the tool's
  # remove_duplicates documentation. The final filter step below would also
  # need to read dedup.*.gz instead of all.*.gz.
  # - type: remove_duplicates
  #   parameters:
  #     inputs:
  #       - all.br.gz
  #     outputs:
  #       - dedup.br.gz

  # - type: remove_duplicates
  #   parameters:
  #     inputs:
  #       - all.fr.gz
  #     outputs:
  #       - dedup.fr.gz

  - type: filter
    parameters:
      inputs:
        - all.br.gz
        - all.fr.gz
      outputs:
        - filtered.br.gz
        - filtered.fr.gz
      # The anchor lets this filter chain be reused through *myfilters; no
      # alias is visible in this section of the file.
      filters: &myfilters
        - LengthFilter:
            unit: word
            min_length: 1
            max_length: 128
        - LengthRatioFilter:
            unit: word
            threshold: 3
        - NonZeroNumeralsFilter: {}
        - AlphabetRatioFilter: {}
        - SimilarityFilter: {}
        - RepetitionFilter:
            threshold: 3
            min_length: 5
            max_length: 128