# Corpus preparation pipeline for a Breton (br) - French (fr) parallel corpus.
# The step types and parameter names follow the OpusFilter configuration
# schema (assumed from the keys used here - confirm against the tool version
# in use). All file names below are relative to `common.output_directory`.

common:
  output_directory: local/opus

steps:
  # ---------------------------------------------------------------------
  # 1. Fetch the raw corpora (one opus_read step per corpus).
  # ---------------------------------------------------------------------
  - type: opus_read
    parameters:
      corpus_name: OfisPublik
      source_language: br
      target_language: fr
      release: latest
      preprocessing: raw
      src_output: ofis.br.gz
      tgt_output: ofis.fr.gz
      suppress_prompts: true

  - type: opus_read
    parameters:
      corpus_name: OpenSubtitles
      source_language: br
      target_language: fr
      release: latest
      preprocessing: raw
      src_output: ost.br.gz
      tgt_output: ost.fr.gz
      suppress_prompts: true

  - type: opus_read
    parameters:
      corpus_name: Tatoeba
      source_language: br
      target_language: fr
      release: latest
      preprocessing: raw
      src_output: tatoeba.br.gz
      tgt_output: tatoeba.fr.gz
      suppress_prompts: true

  # ---------------------------------------------------------------------
  # 2. Group the corpora into a "good" set (OfisPublik + Tatoeba) and a
  #    "dubious" set (OpenSubtitles). The br and fr sides list their inputs
  #    in the same order so that line pairs stay aligned.
  # ---------------------------------------------------------------------
  - type: concatenate
    parameters:
      inputs:
        - ofis.br.gz
        - tatoeba.br.gz
      output: good.br.gz

  - type: concatenate
    parameters:
      inputs:
        - ofis.fr.gz
        - tatoeba.fr.gz
      output: good.fr.gz

  # Single-input concatenations: they effectively copy the OpenSubtitles
  # files under the "dubious" name.
  - type: concatenate
    parameters:
      inputs:
        - ost.br.gz
      output: dubious.br.gz

  - type: concatenate
    parameters:
      inputs:
        - ost.fr.gz
      output: dubious.fr.gz

  # ---------------------------------------------------------------------
  # 3. Train word-alignment priors on everything (dubious + good), after
  #    a permissive length filter.
  # ---------------------------------------------------------------------
  - type: concatenate
    parameters:
      inputs:
        - dubious.br.gz
        - good.br.gz
      output: align_train.br.gz

  - type: concatenate
    parameters:
      inputs:
        - dubious.fr.gz
        - good.fr.gz
      output: align_train.fr.gz

  - type: filter
    parameters:
      inputs:
        - align_train.br.gz
        - align_train.fr.gz
      outputs:
        - align_train-filtered.br.gz
        - align_train-filtered.fr.gz
      filters:
        - LengthFilter:
            unit: word
            min_length: 1
            max_length: 128

  - type: train_alignment
    parameters:
      src_data: align_train-filtered.br.gz
      tgt_data: align_train-filtered.fr.gz
      output: alignment.priors
      # Empty mapping: no aligner options are overridden here.
      parameters: {}

  # ---------------------------------------------------------------------
  # 4. Clean the dubious set: stricter minimum length (4 words) plus a
  #    word-alignment filter that uses the priors trained in step 3.
  # ---------------------------------------------------------------------
  - type: filter
    parameters:
      inputs:
        - dubious.br.gz
        - dubious.fr.gz
      outputs:
        - dubious-filtered.br.gz
        - dubious-filtered.fr.gz
      filters:
        - LengthFilter:
            unit: word
            min_length: 4
            max_length: 128
        - WordAlignFilter:
            priors: alignment.priors

  # ---------------------------------------------------------------------
  # 5. Merge the cleaned dubious data with the good data and apply the
  #    final filter chain.
  # ---------------------------------------------------------------------
  - type: concatenate
    parameters:
      inputs:
        - dubious-filtered.br.gz
        - good.br.gz
      output: all.br.gz

  - type: concatenate
    parameters:
      inputs:
        - dubious-filtered.fr.gz
        - good.fr.gz
      output: all.fr.gz

  - type: filter
    parameters:
      inputs:
        - all.br.gz
        - all.fr.gz
      outputs:
        - filtered.br.gz
        - filtered.fr.gz
      # The anchor lets this filter list be reused through a `*myfilters`
      # alias; no alias is visible in this section of the file.
      filters: &myfilters
        - LengthFilter:
            unit: word
            min_length: 1
            max_length: 128

        - LengthRatioFilter:
            unit: word
            threshold: 3

        - NonZeroNumeralsFilter: {}
        - AlphabetRatioFilter: {}
        - SimilarityFilter: {}
        - RepetitionFilter:
            threshold: 3
            min_length: 5
            max_length: 128