imdbo committed on
Commit
8583892
1 Parent(s): 90541c3

remove comments and replace original paths with generic paths

Browse files
Files changed (1) hide show
  1. bpe-en-gl_emb.yaml +24 -44
bpe-en-gl_emb.yaml CHANGED
@@ -8,63 +8,48 @@ overwrite: True
8
  # Corpus opts:
9
  data:
10
  europarl:
11
- path_src: ../DGTcorpora_tokenized/en_gl/europarl/partitions/en_train.txt
12
- path_tgt: ../DGTcorpora_tokenized/en_gl/europarl/partitions/gl_train.txt
13
  transforms: [bpe, filtertoolong]
14
  weight: 120
15
  opensub:
16
- path_src: ../DGTcorpora_tokenized/en_gl/opensub/partitions/en_train.txt
17
- path_tgt: ../DGTcorpora_tokenized/en_gl/opensub/partitions/gl_train.txt
18
  transforms: [bpe, filtertoolong]
19
  weight: 152
20
  opus:
21
- path_src: ../DGTcorpora_tokenized/en_gl/opus/partitions/en_train.txt
22
- path_tgt: ../DGTcorpora_tokenized/en_gl/opus/partitions/gl_train.txt
23
  transforms: [bpe, filtertoolong]
24
  weight: 160
25
  ted2020:
26
- path_src: ../DGTcorpora_tokenized/en_gl/ted2020/partitions/en_train.txt
27
- path_tgt: ../DGTcorpora_tokenized/en_gl/ted2020/partitions/gl_train.txt
28
  transforms: [bpe, filtertoolong]
29
  weight: 10
30
  corgaback:
31
- path_src: ../DGTcorpora_tokenized/en_gl/corgaback/partitions/en_train.txt
32
- path_tgt: ../DGTcorpora_tokenized/en_gl/corgaback/partitions/gl_train.txt
33
  transforms: [bpe, filtertoolong]
34
  weight: 15
35
  ccmatrix:
36
- path_src: ../DGTcorpora_tokenized/en_gl/ccmatrix/en_tok_dbo.txt
37
- path_tgt: ../DGTcorpora_tokenized/en_gl/ccmatrix/gl_tok_dbo.txt
38
  transforms: [bpe, filtertoolong]
39
- weight: 380 ##75 ## 25000000/13000000 = 2; 760/2 = 380 * 5 = 1900 (380/5=75)
40
  wikimatrix:
41
- path_src: ../DGTcorpora_tokenized/en_gl/wikimatrix/en.txt
42
- path_tgt: ../DGTcorpora_tokenized/en_gl/wikimatrix/gl.txt
43
  transforms: [bpe, filtertoolong]
44
- weight: 70 #25000000/450000 = 55 ; 760/55 = 14 ; 14 * 5 = 70
45
  cluvi:
46
- path_src: ../DGTcorpora_tokenized/en_gl/cluvi/en.txt
47
- path_tgt: ../DGTcorpora_tokenized/en_gl/cluvi/gl.txt
48
  transforms: [bpe, filtertoolong]
49
- weight: 70 #25000000/295000 = 84 ; 760/84 = 9 ; 9 * 10 = 90
50
- #wikimedia:
51
- # path_src: ../DGTcorpora_tokenized/en_gl/wikimedia/en.txt
52
- #path_tgt: ../DGTcorpora_tokenized/en_gl/wikimedia/gl.txt
53
- #transforms: [bpe, filtertoolong]
54
- #weight: 4
55
- # xlent:
56
- #path_src: ../DGTcorpora_tokenized/en_gl/xlent/en.txt
57
- #path_tgt: ../DGTcorpora_tokenized/en_gl/xlent/gl.txt
58
- #transforms: [bpe, filtertoolong]
59
- #weight: 50 #25000000/1600000=15; 760/15=50
60
- #linux:
61
- #path_src: ../DGTcorpora_tokenized/en_gl/linux/en.txt
62
- #path_tgt: ../DGTcorpora_tokenized/en_gl/linux/gl.txt
63
- #transforms: [bpe, filtertoolong]
64
- #weight: 20 #25000000/150000=166; 760/166=5 * 5 = 20
65
  valid:
66
- path_src: ../DGTcorpora_tokenized/en_gl/partitions/all-en_valid.txt
67
- path_tgt: ../DGTcorpora_tokenized/en_gl/partitions/all-gl_valid.txt
68
  transforms: [bpe, filtertoolong]
69
 
70
  ### Transform related opts:
@@ -73,8 +58,7 @@ src_subword_model: ./bpe/en.code
73
  tgt_subword_model: ./bpe/gl.code
74
  src_subword_vocab: ./run/bpe.vocab.src
75
  tgt_subword_vocab: ./run/bpe.vocab.tgt
76
- #src_subword_model: ../sentencepiece/en-gl/en.sp.model
77
- #tgt_subword_model: ../sentencepiece/en-gl/gl.sp.model
78
  src_subword_type: bpe
79
  tgt_subword_type: bpe
80
 
@@ -97,7 +81,7 @@ tgt_embeddings: ../embeddings/gl.emb.txt
97
  embeddings_type: "word2vec"
98
 
99
  # word_vec_size needs to match the dimensions of the pretrained embeddings
100
- word_vec_size: 300
101
 
102
 
103
 
@@ -146,13 +130,9 @@ enc_layers: 6
146
  dec_layers: 6
147
  heads: 8
148
  rnn_size: 512
149
- word_vec_size: 512
150
  transformer_ff: 2048
151
  dropout_steps: [0]
152
  dropout: [0.1]
153
  attention_dropout: [0.1]
154
  share_decoder_embeddings: true
155
- share_embeddings: false
156
-
157
-
158
-
 
8
  # Corpus opts:
9
  data:
10
  europarl:
11
+ path_src: corpora/europarl/partitions/en_train.txt
12
+ path_tgt: corpora/europarl/partitions/gl_train.txt
13
  transforms: [bpe, filtertoolong]
14
  weight: 120
15
  opensub:
16
+ path_src: corpora/opensub/partitions/en_train.txt
17
+ path_tgt: corpora/opensub/partitions/gl_train.txt
18
  transforms: [bpe, filtertoolong]
19
  weight: 152
20
  opus:
21
+ path_src: corpora/opus/partitions/en_train.txt
22
+ path_tgt: corpora/opus/partitions/gl_train.txt
23
  transforms: [bpe, filtertoolong]
24
  weight: 160
25
  ted2020:
26
+ path_src: corpora/ted2020/partitions/en_train.txt
27
+ path_tgt: corpora/ted2020/partitions/gl_train.txt
28
  transforms: [bpe, filtertoolong]
29
  weight: 10
30
  corgaback:
31
+ path_src: corpora/corgaback/partitions/en_train.txt
32
+ path_tgt: corpora/corgaback/partitions/gl_train.txt
33
  transforms: [bpe, filtertoolong]
34
  weight: 15
35
  ccmatrix:
36
+ path_src: corpora/ccmatrix/en_tok_dbo.txt
37
+ path_tgt: corpora/ccmatrix/gl_tok_dbo.txt
38
  transforms: [bpe, filtertoolong]
39
+ weight: 380
40
  wikimatrix:
41
+ path_src: corpora/wikimatrix/en.txt
42
+ path_tgt: corpora/wikimatrix/gl.txt
43
  transforms: [bpe, filtertoolong]
44
+ weight: 70
45
  cluvi:
46
+ path_src: corpora/cluvi/en.txt
47
+ path_tgt: corpora/cluvi/gl.txt
48
  transforms: [bpe, filtertoolong]
49
+ weight: 70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  valid:
51
+ path_src: corpora/partitions/all-en_valid.txt
52
+ path_tgt: corpora/partitions/all-gl_valid.txt
53
  transforms: [bpe, filtertoolong]
54
 
55
  ### Transform related opts:
 
58
  tgt_subword_model: ./bpe/gl.code
59
  src_subword_vocab: ./run/bpe.vocab.src
60
  tgt_subword_vocab: ./run/bpe.vocab.tgt
61
+
 
62
  src_subword_type: bpe
63
  tgt_subword_type: bpe
64
 
 
81
  embeddings_type: "word2vec"
82
 
83
  # word_vec_size needs to match the dimensions of the pretrained embeddings
84
+ word_vec_size: 512
85
 
86
 
87
 
 
130
  dec_layers: 6
131
  heads: 8
132
  rnn_size: 512
 
133
  transformer_ff: 2048
134
  dropout_steps: [0]
135
  dropout: [0.1]
136
  attention_dropout: [0.1]
137
  share_decoder_embeddings: true
138
+ share_embeddings: false