proxectonos
/

Nos_MT-OpenNMT-gl-es

Galician

Model card Files Files and versions Community

imdbo committed on Mar 13, 2023

Commit

c7b0dc8

•

1 Parent(s): 3e9f026

remove comments and replace original paths with generic paths

Browse files

Files changed (1) hide show

bpe-gl-es_emb.yaml +37 -40

bpe-gl-es_emb.yaml CHANGED Viewed

@@ -1,77 +1,75 @@
 save_data: run
 ## Where the vocab(s) will be written
-src_vocab: run/vocab/es-gl/bpe.vocab.src
-tgt_vocab: run/vocab/es-gl/bpe.vocab.tgt
 overwrite: True
 # Corpus opts:
 data:
     europarl:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/europarl/partitions/es_train.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/europarl_translit/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
-        weight: 120 #60 #120
     opensub:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/opensub/partitions/es_train.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/opensub_translit/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
-        weight: 180 #900 #180
     dgt:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/dgt/partitions/es_train.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/dgt_translit/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
-        weight: 18 #9 #18
     cluvi:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/cluvi/partitions/es_train.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/cluvi/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
-        weight: 40 # 4 #40
     opensub-es-gl:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/opensub-es-gl/partitions/es_train.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/opensub-es-gl/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
-        weight: 25 # 5 #25 #25
     ted2020:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/ted2020/partitions/es_train.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/ted2020/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
-        weight: 10 # 1 #10 #10
     corgaback:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/corgaback/partitions/es_train.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/corgaback/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
-        weight: 13 # 66  #14 #13
     ccmatrix:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/ccmatrix/es.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/ccmatrix/gl.txt
         transforms: [bpe, filtertoolong]
-        weight: 180 ##como opensub, tamanho semelhante
     resto:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/resto/es.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/resto/gl.txt
         transforms: [bpe, filtertoolong]
-        weight: 120 ##como europarl, tamanho semelhante
     opensub_2018:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/opensub_2018/es.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/opensub_2018/gl.txt
         transforms: [bpe, filtertoolong]
-        weight: 25 #igual que opensub_es-gl
     valid:
-        path_tgt: ../DGTcorpora_tokenized/es_gz/partitions/all-es_valid.txt
-        path_src: ../DGTcorpora_tokenized/es_gz/partitions_translit/all-gl_valid.txt
         transforms: [bpe, filtertoolong]
 ### Transform related opts:
 #### Subword
 tgt_subword_model: ./bpe/es.code
 src_subword_model: ./bpe/gl.code
-tgt_subword_vocab: ./run/vocab/es-gl/bpe.vocab.src
-src_subword_vocab: ./run/vocab/es-gl/bpe.vocab.tgt
-#tgt_subword_model: ../sentencepiece/en-gl/en.sp.model
-#src_subword_model: ../sentencepiece/en-gl/gl.sp.model
 src_subword_type: bpe
 tgt_subord_type: bpe
@@ -88,7 +86,7 @@ src_embeddings: ../embeddings/gl.emb.txt
 embeddings_type: "word2vec"
 # word_vec_size need to match with the pretrained embeddings dimensions
-word_vec_size: 300
 #### Filter
@@ -146,7 +144,6 @@ enc_layers: 6
 dec_layers: 6
 heads: 8
 rnn_size: 512
-word_vec_size: 512
 transformer_ff: 2048
 dropout_steps: [0]
 dropout: [0.1]

 # Output locations for this run. The vocab paths below live under the same
 # `run` directory that save_data names.
 # NOTE(review): save_data is presumably the output prefix for generated
 # artifacts — confirm against the OpenNMT-py documentation.
 save_data: run
 ## Where the vocab(s) will be written
+src_vocab: run/vocab/gl-es/bpe.vocab.src
+tgt_vocab: run/vocab/gl-es/bpe.vocab.tgt
 # NOTE(review): presumably allows existing output files to be overwritten —
 # confirm.
 overwrite: True
 # Corpus opts:
 # Each entry under `data` names one parallel corpus. In every entry path_src
 # points at the Galician (gl) file and path_tgt at the Spanish (es) file, and
 # every entry uses the same transforms list: [bpe, filtertoolong].
 # For europarl, opensub, dgt and valid the source file is read from a
 # `*_translit` directory while the target file is not.
 # NOTE(review): `weight` presumably sets the relative sampling ratio between
 # the training corpora, and `valid` (which has no weight) is presumably the
 # validation set — confirm against the OpenNMT-py data documentation.
 data:
     europarl:
+        path_tgt: corpora/europarl/partitions/es_train.txt
+        path_src: corpora/europarl_translit/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
+        weight: 120
     opensub:
+        path_tgt: corpora/opensub/partitions/es_train.txt
+        path_src: corpora/opensub_translit/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
+        weight: 180
     dgt:
+        path_tgt: corpora/dgt/partitions/es_train.txt
+        path_src: corpora/dgt_translit/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
+        weight: 18
     cluvi:
+        path_tgt: corpora/cluvi/partitions/es_train.txt
+        path_src: corpora/cluvi/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
+        weight: 40
     opensub-es-gl:
+        path_tgt: corpora/opensub-es-gl/partitions/es_train.txt
+        path_src: corpora/opensub-es-gl/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
+        weight: 25
     ted2020:
+        path_tgt: corpora/ted2020/partitions/es_train.txt
+        path_src: corpora/ted2020/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
+        weight: 10
     corgaback:
+        path_tgt: corpora/corgaback/partitions/es_train.txt
+        path_src: corpora/corgaback/partitions/gl_train.txt
         transforms: [bpe, filtertoolong]
+        weight: 13
     ccmatrix:
+        path_tgt: corpora/ccmatrix/es.txt
+        path_src: corpora/ccmatrix/gl.txt
         transforms: [bpe, filtertoolong]
+        weight: 180
     resto:
+        path_tgt: corpora/resto/es.txt
+        path_src: corpora/resto/gl.txt
         transforms: [bpe, filtertoolong]
+        weight: 120
     opensub_2018:
+        path_tgt: corpora/opensub_2018/es.txt
+        path_src: corpora/opensub_2018/gl.txt
         transforms: [bpe, filtertoolong]
+        weight: 25
     valid:
+        path_tgt: corpora/partitions/all-es_valid.txt
+        path_src: corpora/partitions_translit/all-gl_valid.txt
         transforms: [bpe, filtertoolong]
 ### Transform related opts:
 #### Subword
 # Subword model files for the bpe transform: the target side uses the
 # Spanish (es) codes and the source side the Galician (gl) codes.
 tgt_subword_model: ./bpe/es.code
 src_subword_model: ./bpe/gl.code
 # NOTE(review): the tgt_* key points at bpe.vocab.src and the src_* key at
 # bpe.vocab.tgt, i.e. the two vocab files look cross-wired relative to
 # src_vocab/tgt_vocab. Left unchanged here because it may reflect how the
 # published model was trained — confirm before swapping.
+tgt_subword_vocab: ./run/vocab/gl-es/bpe.vocab.src
+src_subword_vocab: ./run/vocab/gl-es/bpe.vocab.tgt
 src_subword_type: bpe
 # Spelled tgt_subword_type to mirror src_subword_type above; a misspelled
 # key is not recognised as this option.
 tgt_subword_type: bpe
 embeddings_type: "word2vec"
 # word_vec_size needs to match the dimensionality of the pretrained embeddings.
 # NOTE(review): confirm that the pretrained embedding files really are
 # 512-dimensional.
+word_vec_size: 512
 #### Filter
 dec_layers: 6
 heads: 8
 rnn_size: 512
 transformer_ff: 2048
 dropout_steps: [0]
 dropout: [0.1]