initial commit
Browse files- avg5.ckpt +3 -0
- config.yaml +131 -0
- hyp.test +0 -0
- sp.model +3 -0
- src_vocab.txt +0 -0
- test.de-en.tsv +0 -0
- test.log +42 -0
- train.log +0 -0
- trg_vocab.txt +0 -0
- validations.txt +29 -0
avg5.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16785664244d8110805afb1ced49b7f8fc5aec327e1ac6cd2024f4bfd3c0f55a
|
3 |
+
size 2812902554
|
config.yaml
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: "iwslt14_deenfr_prompt"
|
2 |
+
joeynmt_version: "2.3.0"
|
3 |
+
model_dir: "iwslt14_prompt"
|
4 |
+
use_cuda: True
|
5 |
+
fp16: True
|
6 |
+
random_seed: 42
|
7 |
+
|
8 |
+
data:
|
9 |
+
#train: "iwslt14_prompt/train" # cf. https://wit3.fbk.eu/2014-01
|
10 |
+
#dev: "iwslt14_prompt/dev"
|
11 |
+
test: "iwslt14_prompt/test.de-en" # ['TED.dev2010', 'TEDX.dev2012', 'TED.tst2010', 'TED.tst2011', 'TED.tst2012']
|
12 |
+
dataset_type: "tsv"
|
13 |
+
sample_dev_subset: 500
|
14 |
+
src:
|
15 |
+
lang: "src"
|
16 |
+
max_length: 512
|
17 |
+
lowercase: False
|
18 |
+
normalize: False
|
19 |
+
level: "bpe"
|
20 |
+
voc_limit: 32000
|
21 |
+
voc_min_freq: 1
|
22 |
+
voc_file: "iwslt14_prompt/src_vocab.txt"
|
23 |
+
tokenizer_type: "sentencepiece"
|
24 |
+
tokenizer_cfg:
|
25 |
+
model_file: "iwslt14_prompt/sp.model"
|
26 |
+
model_type: "unigram"
|
27 |
+
character_coverage: 1.0
|
28 |
+
trg:
|
29 |
+
lang: "trg"
|
30 |
+
max_length: 512
|
31 |
+
lowercase: False
|
32 |
+
normalize: False
|
33 |
+
level: "bpe"
|
34 |
+
voc_limit: 32000
|
35 |
+
voc_min_freq: 1
|
36 |
+
voc_file: "iwslt14_prompt/trg_vocab.txt"
|
37 |
+
tokenizer_type: "sentencepiece"
|
38 |
+
tokenizer_cfg:
|
39 |
+
model_file: "iwslt14_prompt/sp.model"
|
40 |
+
model_type: "unigram"
|
41 |
+
character_coverage: 1.0
|
42 |
+
special_symbols:
|
43 |
+
unk_token: "<unk>"
|
44 |
+
unk_id: 0
|
45 |
+
pad_token: "<pad>"
|
46 |
+
pad_id: 1
|
47 |
+
bos_token: "<s>"
|
48 |
+
bos_id: 2
|
49 |
+
eos_token: "</s>"
|
50 |
+
eos_id: 3
|
51 |
+
sep_token: "<sep>"
|
52 |
+
sep_id: 4
|
53 |
+
lang_tags: ["<de>", "<en>", "<fr>"]
|
54 |
+
|
55 |
+
testing:
|
56 |
+
load_model: "iwslt14_prompt/avg5.ckpt"
|
57 |
+
n_best: 1
|
58 |
+
beam_size: 5
|
59 |
+
beam_alpha: 1.0
|
60 |
+
batch_size: 32
|
61 |
+
batch_type: "sentence"
|
62 |
+
max_output_length: 512
|
63 |
+
eval_metrics: ["bleu"]
|
64 |
+
sacrebleu_cfg:
|
65 |
+
tokenize: "13a"
|
66 |
+
lowercase: True
|
67 |
+
|
68 |
+
training:
|
69 |
+
#load_model: "iwslt14_prompt/latest.ckpt"
|
70 |
+
#reset_best_ckpt: True
|
71 |
+
#reset_scheduler: True
|
72 |
+
#reset_optimizer: True
|
73 |
+
#reset_iter_state: True
|
74 |
+
optimizer: "adamw"
|
75 |
+
normalization: "tokens"
|
76 |
+
adam_betas: [0.9, 0.98]
|
77 |
+
scheduling: "warmupinversesquareroot"
|
78 |
+
learning_rate_warmup: 10000
|
79 |
+
learning_rate: 0.0002
|
80 |
+
learning_rate_min: 0.0000001
|
81 |
+
weight_decay: 0.001
|
82 |
+
label_smoothing: 0.1
|
83 |
+
loss: "crossentropy"
|
84 |
+
batch_size: 32
|
85 |
+
batch_type: "sentence"
|
86 |
+
batch_multiplier: 4
|
87 |
+
early_stopping_metric: "bleu"
|
88 |
+
epochs: 50
|
89 |
+
validation_freq: 1000
|
90 |
+
logging_freq: 100
|
91 |
+
overwrite: False
|
92 |
+
shuffle: True
|
93 |
+
print_valid_sents: [0, 1, 2, 3]
|
94 |
+
keep_best_ckpts: 5
|
95 |
+
|
96 |
+
model:
|
97 |
+
initializer: "xavier_uniform"
|
98 |
+
bias_initializer: "zeros"
|
99 |
+
init_gain: 1.0
|
100 |
+
embed_initializer: "xavier_uniform"
|
101 |
+
embed_init_gain: 1.0
|
102 |
+
tied_embeddings: True
|
103 |
+
tied_softmax: True
|
104 |
+
encoder:
|
105 |
+
type: "transformer"
|
106 |
+
num_layers: 6
|
107 |
+
num_heads: 8
|
108 |
+
embeddings:
|
109 |
+
embedding_dim: 1024
|
110 |
+
scale: True
|
111 |
+
dropout: 0.1
|
112 |
+
# typically ff_size = 4 x hidden_size
|
113 |
+
hidden_size: 1024
|
114 |
+
ff_size: 4096
|
115 |
+
dropout: 0.1
|
116 |
+
layer_norm: "pre"
|
117 |
+
activation: "relu"
|
118 |
+
decoder:
|
119 |
+
type: "transformer"
|
120 |
+
num_layers: 6
|
121 |
+
num_heads: 8
|
122 |
+
embeddings:
|
123 |
+
embedding_dim: 1024
|
124 |
+
scale: True
|
125 |
+
dropout: 0.1
|
126 |
+
# typically ff_size = 4 x hidden_size
|
127 |
+
hidden_size: 1024
|
128 |
+
ff_size: 4096
|
129 |
+
dropout: 0.1
|
130 |
+
layer_norm: "pre"
|
131 |
+
activation: "relu"
|
hyp.test
ADDED
The diff for this file is too large to render.
See raw diff
|
|
sp.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aaa7aad830134efe55afcbd6690ec305a7ed1599b10e499d6be3133fb6872251
|
3 |
+
size 832871
|
src_vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.de-en.tsv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.log
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-01-16 17:27:15,168 - INFO - root - Hello! This is Joey-NMT (version 2.3.0).
|
2 |
+
2024-01-16 17:27:15,255 - INFO - joeynmt.data - Building tokenizer...
|
3 |
+
2024-01-16 17:27:15,340 - INFO - joeynmt.tokenizers - src tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 512), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
|
4 |
+
2024-01-16 17:27:15,341 - INFO - joeynmt.tokenizers - trg tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 512), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
|
5 |
+
2024-01-16 17:27:15,341 - INFO - joeynmt.data - Building vocabulary...
|
6 |
+
2024-01-16 17:28:01,074 - INFO - root - Hello! This is Joey-NMT (version 2.3.0).
|
7 |
+
2024-01-16 17:28:01,157 - INFO - joeynmt.data - Building tokenizer...
|
8 |
+
2024-01-16 17:28:01,241 - INFO - joeynmt.tokenizers - src tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 512), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
|
9 |
+
2024-01-16 17:28:01,242 - INFO - joeynmt.tokenizers - trg tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 512), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
|
10 |
+
2024-01-16 17:28:01,242 - INFO - joeynmt.data - Building vocabulary...
|
11 |
+
2024-01-16 17:28:11,213 - INFO - joeynmt.data - Loading test set...
|
12 |
+
2024-01-16 17:28:11,970 - INFO - numexpr.utils - Note: detected 96 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
|
13 |
+
2024-01-16 17:28:11,970 - INFO - numexpr.utils - Note: NumExpr detected 96 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
|
14 |
+
2024-01-16 17:28:13,195 - INFO - joeynmt.data - Data loaded.
|
15 |
+
2024-01-16 17:28:13,196 - INFO - joeynmt.data - Train dataset: None
|
16 |
+
2024-01-16 17:28:13,196 - INFO - joeynmt.data - Valid dataset: None
|
17 |
+
2024-01-16 17:28:13,196 - INFO - joeynmt.data - Test dataset: TsvDataset(split=test, len=6743, src_lang=src, trg_lang=trg, has_trg=True, random_subset=-1, has_src_prompt=True, has_trg_prompt=True)
|
18 |
+
2024-01-16 17:28:13,196 - INFO - joeynmt.data - First 10 Src tokens: (0) <unk> (1) <pad> (2) <s> (3) </s> (4) <sep> (5) <de> (6) <en> (7) <fr> (8) , (9) .
|
19 |
+
2024-01-16 17:28:13,196 - INFO - joeynmt.data - First 10 Trg tokens: (0) <unk> (1) <pad> (2) <s> (3) </s> (4) <sep> (5) <de> (6) <en> (7) <fr> (8) , (9) .
|
20 |
+
2024-01-16 17:28:13,196 - INFO - joeynmt.data - Number of unique Src tokens (vocab_size): 32000
|
21 |
+
2024-01-16 17:28:13,196 - INFO - joeynmt.data - Number of unique Trg tokens (vocab_size): 32000
|
22 |
+
2024-01-16 17:28:13,196 - INFO - joeynmt.model - Building an encoder-decoder model...
|
23 |
+
2024-01-16 17:28:16,416 - INFO - joeynmt.model - Enc-dec model built.
|
24 |
+
2024-01-16 17:28:16,419 - INFO - joeynmt.model - Total params: 209129472
|
25 |
+
2024-01-16 17:28:16,420 - DEBUG - joeynmt.model - Trainable parameters: ['decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.dec_layer_norm.bias', 'decoder.layers.0.dec_layer_norm.weight', 'decoder.layers.0.feed_forward.layer_norm.bias', 'decoder.layers.0.feed_forward.layer_norm.weight', 'decoder.layers.0.feed_forward.pwff_layer.0.bias', 'decoder.layers.0.feed_forward.pwff_layer.0.weight', 'decoder.layers.0.feed_forward.pwff_layer.3.bias', 'decoder.layers.0.feed_forward.pwff_layer.3.weight', 'decoder.layers.0.src_trg_att.k_layer.bias', 'decoder.layers.0.src_trg_att.k_layer.weight', 'decoder.layers.0.src_trg_att.output_layer.bias', 'decoder.layers.0.src_trg_att.output_layer.weight', 'decoder.layers.0.src_trg_att.q_layer.bias', 'decoder.layers.0.src_trg_att.q_layer.weight', 'decoder.layers.0.src_trg_att.v_layer.bias', 'decoder.layers.0.src_trg_att.v_layer.weight', 'decoder.layers.0.trg_trg_att.k_layer.bias', 'decoder.layers.0.trg_trg_att.k_layer.weight', 'decoder.layers.0.trg_trg_att.output_layer.bias', 'decoder.layers.0.trg_trg_att.output_layer.weight', 'decoder.layers.0.trg_trg_att.q_layer.bias', 'decoder.layers.0.trg_trg_att.q_layer.weight', 'decoder.layers.0.trg_trg_att.v_layer.bias', 'decoder.layers.0.trg_trg_att.v_layer.weight', 'decoder.layers.0.x_layer_norm.bias', 'decoder.layers.0.x_layer_norm.weight', 'decoder.layers.1.dec_layer_norm.bias', 'decoder.layers.1.dec_layer_norm.weight', 'decoder.layers.1.feed_forward.layer_norm.bias', 'decoder.layers.1.feed_forward.layer_norm.weight', 'decoder.layers.1.feed_forward.pwff_layer.0.bias', 'decoder.layers.1.feed_forward.pwff_layer.0.weight', 'decoder.layers.1.feed_forward.pwff_layer.3.bias', 'decoder.layers.1.feed_forward.pwff_layer.3.weight', 'decoder.layers.1.src_trg_att.k_layer.bias', 'decoder.layers.1.src_trg_att.k_layer.weight', 'decoder.layers.1.src_trg_att.output_layer.bias', 'decoder.layers.1.src_trg_att.output_layer.weight', 'decoder.layers.1.src_trg_att.q_layer.bias', 'decoder.layers.1.src_trg_att.q_layer.weight', 'decoder.layers.1.src_trg_att.v_layer.bias', 'decoder.layers.1.src_trg_att.v_layer.weight', 'decoder.layers.1.trg_trg_att.k_layer.bias', 'decoder.layers.1.trg_trg_att.k_layer.weight', 'decoder.layers.1.trg_trg_att.output_layer.bias', 'decoder.layers.1.trg_trg_att.output_layer.weight', 'decoder.layers.1.trg_trg_att.q_layer.bias', 'decoder.layers.1.trg_trg_att.q_layer.weight', 'decoder.layers.1.trg_trg_att.v_layer.bias', 'decoder.layers.1.trg_trg_att.v_layer.weight', 'decoder.layers.1.x_layer_norm.bias', 'decoder.layers.1.x_layer_norm.weight', 'decoder.layers.2.dec_layer_norm.bias', 'decoder.layers.2.dec_layer_norm.weight', 'decoder.layers.2.feed_forward.layer_norm.bias', 'decoder.layers.2.feed_forward.layer_norm.weight', 'decoder.layers.2.feed_forward.pwff_layer.0.bias', 'decoder.layers.2.feed_forward.pwff_layer.0.weight', 'decoder.layers.2.feed_forward.pwff_layer.3.bias', 'decoder.layers.2.feed_forward.pwff_layer.3.weight', 'decoder.layers.2.src_trg_att.k_layer.bias', 'decoder.layers.2.src_trg_att.k_layer.weight', 'decoder.layers.2.src_trg_att.output_layer.bias', 'decoder.layers.2.src_trg_att.output_layer.weight', 'decoder.layers.2.src_trg_att.q_layer.bias', 'decoder.layers.2.src_trg_att.q_layer.weight', 'decoder.layers.2.src_trg_att.v_layer.bias', 'decoder.layers.2.src_trg_att.v_layer.weight', 'decoder.layers.2.trg_trg_att.k_layer.bias', 'decoder.layers.2.trg_trg_att.k_layer.weight', 'decoder.layers.2.trg_trg_att.output_layer.bias', 'decoder.layers.2.trg_trg_att.output_layer.weight', 'decoder.layers.2.trg_trg_att.q_layer.bias', 'decoder.layers.2.trg_trg_att.q_layer.weight', 'decoder.layers.2.trg_trg_att.v_layer.bias', 'decoder.layers.2.trg_trg_att.v_layer.weight', 'decoder.layers.2.x_layer_norm.bias', 'decoder.layers.2.x_layer_norm.weight', 'decoder.layers.3.dec_layer_norm.bias', 'decoder.layers.3.dec_layer_norm.weight', 'decoder.layers.3.feed_forward.layer_norm.bias', 'decoder.layers.3.feed_forward.layer_norm.weight', 'decoder.layers.3.feed_forward.pwff_layer.0.bias', 'decoder.layers.3.feed_forward.pwff_layer.0.weight', 'decoder.layers.3.feed_forward.pwff_layer.3.bias', 'decoder.layers.3.feed_forward.pwff_layer.3.weight', 'decoder.layers.3.src_trg_att.k_layer.bias', 'decoder.layers.3.src_trg_att.k_layer.weight', 'decoder.layers.3.src_trg_att.output_layer.bias', 'decoder.layers.3.src_trg_att.output_layer.weight', 'decoder.layers.3.src_trg_att.q_layer.bias', 'decoder.layers.3.src_trg_att.q_layer.weight', 'decoder.layers.3.src_trg_att.v_layer.bias', 'decoder.layers.3.src_trg_att.v_layer.weight', 'decoder.layers.3.trg_trg_att.k_layer.bias', 'decoder.layers.3.trg_trg_att.k_layer.weight', 'decoder.layers.3.trg_trg_att.output_layer.bias', 'decoder.layers.3.trg_trg_att.output_layer.weight', 'decoder.layers.3.trg_trg_att.q_layer.bias', 'decoder.layers.3.trg_trg_att.q_layer.weight', 'decoder.layers.3.trg_trg_att.v_layer.bias', 'decoder.layers.3.trg_trg_att.v_layer.weight', 'decoder.layers.3.x_layer_norm.bias', 'decoder.layers.3.x_layer_norm.weight', 'decoder.layers.4.dec_layer_norm.bias', 'decoder.layers.4.dec_layer_norm.weight', 'decoder.layers.4.feed_forward.layer_norm.bias', 'decoder.layers.4.feed_forward.layer_norm.weight', 'decoder.layers.4.feed_forward.pwff_layer.0.bias', 'decoder.layers.4.feed_forward.pwff_layer.0.weight', 'decoder.layers.4.feed_forward.pwff_layer.3.bias', 'decoder.layers.4.feed_forward.pwff_layer.3.weight', 'decoder.layers.4.src_trg_att.k_layer.bias', 'decoder.layers.4.src_trg_att.k_layer.weight', 'decoder.layers.4.src_trg_att.output_layer.bias', 'decoder.layers.4.src_trg_att.output_layer.weight', 'decoder.layers.4.src_trg_att.q_layer.bias', 'decoder.layers.4.src_trg_att.q_layer.weight', 'decoder.layers.4.src_trg_att.v_layer.bias', 'decoder.layers.4.src_trg_att.v_layer.weight', 'decoder.layers.4.trg_trg_att.k_layer.bias', 'decoder.layers.4.trg_trg_att.k_layer.weight', 'decoder.layers.4.trg_trg_att.output_layer.bias', 'decoder.layers.4.trg_trg_att.output_layer.weight', 'decoder.layers.4.trg_trg_att.q_layer.bias', 'decoder.layers.4.trg_trg_att.q_layer.weight', 'decoder.layers.4.trg_trg_att.v_layer.bias', 'decoder.layers.4.trg_trg_att.v_layer.weight', 'decoder.layers.4.x_layer_norm.bias', 'decoder.layers.4.x_layer_norm.weight', 'decoder.layers.5.dec_layer_norm.bias', 'decoder.layers.5.dec_layer_norm.weight', 'decoder.layers.5.feed_forward.layer_norm.bias', 'decoder.layers.5.feed_forward.layer_norm.weight', 'decoder.layers.5.feed_forward.pwff_layer.0.bias', 'decoder.layers.5.feed_forward.pwff_layer.0.weight', 'decoder.layers.5.feed_forward.pwff_layer.3.bias', 'decoder.layers.5.feed_forward.pwff_layer.3.weight', 'decoder.layers.5.src_trg_att.k_layer.bias', 'decoder.layers.5.src_trg_att.k_layer.weight', 'decoder.layers.5.src_trg_att.output_layer.bias', 'decoder.layers.5.src_trg_att.output_layer.weight', 'decoder.layers.5.src_trg_att.q_layer.bias', 'decoder.layers.5.src_trg_att.q_layer.weight', 'decoder.layers.5.src_trg_att.v_layer.bias', 'decoder.layers.5.src_trg_att.v_layer.weight', 'decoder.layers.5.trg_trg_att.k_layer.bias', 'decoder.layers.5.trg_trg_att.k_layer.weight', 'decoder.layers.5.trg_trg_att.output_layer.bias', 'decoder.layers.5.trg_trg_att.output_layer.weight', 'decoder.layers.5.trg_trg_att.q_layer.bias', 'decoder.layers.5.trg_trg_att.q_layer.weight', 'decoder.layers.5.trg_trg_att.v_layer.bias', 'decoder.layers.5.trg_trg_att.v_layer.weight', 'decoder.layers.5.x_layer_norm.bias', 'decoder.layers.5.x_layer_norm.weight', 'encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.feed_forward.layer_norm.bias', 'encoder.layers.0.feed_forward.layer_norm.weight', 'encoder.layers.0.feed_forward.pwff_layer.0.bias', 'encoder.layers.0.feed_forward.pwff_layer.0.weight', 'encoder.layers.0.feed_forward.pwff_layer.3.bias', 'encoder.layers.0.feed_forward.pwff_layer.3.weight', 'encoder.layers.0.layer_norm.bias', 'encoder.layers.0.layer_norm.weight', 'encoder.layers.0.src_src_att.k_layer.bias', 'encoder.layers.0.src_src_att.k_layer.weight', 'encoder.layers.0.src_src_att.output_layer.bias', 'encoder.layers.0.src_src_att.output_layer.weight', 'encoder.layers.0.src_src_att.q_layer.bias', 'encoder.layers.0.src_src_att.q_layer.weight', 'encoder.layers.0.src_src_att.v_layer.bias', 'encoder.layers.0.src_src_att.v_layer.weight', 'encoder.layers.1.feed_forward.layer_norm.bias', 'encoder.layers.1.feed_forward.layer_norm.weight', 'encoder.layers.1.feed_forward.pwff_layer.0.bias', 'encoder.layers.1.feed_forward.pwff_layer.0.weight', 'encoder.layers.1.feed_forward.pwff_layer.3.bias', 'encoder.layers.1.feed_forward.pwff_layer.3.weight', 'encoder.layers.1.layer_norm.bias', 'encoder.layers.1.layer_norm.weight', 'encoder.layers.1.src_src_att.k_layer.bias', 'encoder.layers.1.src_src_att.k_layer.weight', 'encoder.layers.1.src_src_att.output_layer.bias', 'encoder.layers.1.src_src_att.output_layer.weight', 'encoder.layers.1.src_src_att.q_layer.bias', 'encoder.layers.1.src_src_att.q_layer.weight', 'encoder.layers.1.src_src_att.v_layer.bias', 'encoder.layers.1.src_src_att.v_layer.weight', 'encoder.layers.2.feed_forward.layer_norm.bias', 'encoder.layers.2.feed_forward.layer_norm.weight', 'encoder.layers.2.feed_forward.pwff_layer.0.bias', 'encoder.layers.2.feed_forward.pwff_layer.0.weight', 'encoder.layers.2.feed_forward.pwff_layer.3.bias', 'encoder.layers.2.feed_forward.pwff_layer.3.weight', 'encoder.layers.2.layer_norm.bias', 'encoder.layers.2.layer_norm.weight', 'encoder.layers.2.src_src_att.k_layer.bias', 'encoder.layers.2.src_src_att.k_layer.weight', 'encoder.layers.2.src_src_att.output_layer.bias', 'encoder.layers.2.src_src_att.output_layer.weight', 'encoder.layers.2.src_src_att.q_layer.bias', 'encoder.layers.2.src_src_att.q_layer.weight', 'encoder.layers.2.src_src_att.v_layer.bias', 'encoder.layers.2.src_src_att.v_layer.weight', 'encoder.layers.3.feed_forward.layer_norm.bias', 'encoder.layers.3.feed_forward.layer_norm.weight', 'encoder.layers.3.feed_forward.pwff_layer.0.bias', 'encoder.layers.3.feed_forward.pwff_layer.0.weight', 'encoder.layers.3.feed_forward.pwff_layer.3.bias', 'encoder.layers.3.feed_forward.pwff_layer.3.weight', 'encoder.layers.3.layer_norm.bias', 'encoder.layers.3.layer_norm.weight', 'encoder.layers.3.src_src_att.k_layer.bias', 'encoder.layers.3.src_src_att.k_layer.weight', 'encoder.layers.3.src_src_att.output_layer.bias', 'encoder.layers.3.src_src_att.output_layer.weight', 'encoder.layers.3.src_src_att.q_layer.bias', 'encoder.layers.3.src_src_att.q_layer.weight', 'encoder.layers.3.src_src_att.v_layer.bias', 'encoder.layers.3.src_src_att.v_layer.weight', 'encoder.layers.4.feed_forward.layer_norm.bias', 'encoder.layers.4.feed_forward.layer_norm.weight', 'encoder.layers.4.feed_forward.pwff_layer.0.bias', 'encoder.layers.4.feed_forward.pwff_layer.0.weight', 'encoder.layers.4.feed_forward.pwff_layer.3.bias', 'encoder.layers.4.feed_forward.pwff_layer.3.weight', 'encoder.layers.4.layer_norm.bias', 'encoder.layers.4.layer_norm.weight', 'encoder.layers.4.src_src_att.k_layer.bias', 'encoder.layers.4.src_src_att.k_layer.weight', 'encoder.layers.4.src_src_att.output_layer.bias', 'encoder.layers.4.src_src_att.output_layer.weight', 'encoder.layers.4.src_src_att.q_layer.bias', 'encoder.layers.4.src_src_att.q_layer.weight', 'encoder.layers.4.src_src_att.v_layer.bias', 'encoder.layers.4.src_src_att.v_layer.weight', 'encoder.layers.5.feed_forward.layer_norm.bias', 'encoder.layers.5.feed_forward.layer_norm.weight', 'encoder.layers.5.feed_forward.pwff_layer.0.bias', 'encoder.layers.5.feed_forward.pwff_layer.0.weight', 'encoder.layers.5.feed_forward.pwff_layer.3.bias', 'encoder.layers.5.feed_forward.pwff_layer.3.weight', 'encoder.layers.5.layer_norm.bias', 'encoder.layers.5.layer_norm.weight', 'encoder.layers.5.src_src_att.k_layer.bias', 'encoder.layers.5.src_src_att.k_layer.weight', 'encoder.layers.5.src_src_att.output_layer.bias', 'encoder.layers.5.src_src_att.output_layer.weight', 'encoder.layers.5.src_src_att.q_layer.bias', 'encoder.layers.5.src_src_att.q_layer.weight', 'encoder.layers.5.src_src_att.v_layer.bias', 'encoder.layers.5.src_src_att.v_layer.weight', 'src_embed.lut.weight']
|
26 |
+
2024-01-16 17:28:16,421 - INFO - joeynmt.prediction - Loading model from models/iwslt14_prompt/avg5.ckpt
|
27 |
+
2024-01-16 17:28:24,668 - INFO - joeynmt.prediction - DataParallelWrapper(
|
28 |
+
(module): DataParallel(
|
29 |
+
(module): Model(
|
30 |
+
encoder=TransformerEncoder(num_layers=6, num_heads=8, alpha=1.0, layer_norm="pre", activation=ReLU()),
|
31 |
+
decoder=TransformerDecoder(num_layers=6, num_heads=8, alpha=1.0, layer_norm="pre", activation=ReLU()),
|
32 |
+
src_embed=Embeddings(embedding_dim=1024, vocab_size=32000),
|
33 |
+
trg_embed=Embeddings(embedding_dim=1024, vocab_size=32000),
|
34 |
+
loss_function=XentLoss(criterion=KLDivLoss(), smoothing=0.1))
|
35 |
+
)
|
36 |
+
)
|
37 |
+
2024-01-16 17:28:24,681 - INFO - joeynmt.prediction - Decoding on test set... (device: cuda, n_gpu: 2, use_ddp: False, fp16: True)
|
38 |
+
2024-01-16 17:28:24,681 - INFO - joeynmt.prediction - Predicting 6743 example(s)... (Beam search with beam_size=5, beam_alpha=1.0, n_best=1, min_output_length=1, max_output_length=512, return_prob='none', generate_unk=True, repetition_penalty=-1, no_repeat_ngram_size=-1)
|
39 |
+
2024-01-16 18:19:58,646 - INFO - joeynmt.prediction - Generation took 3093.9641[sec].
|
40 |
+
2024-01-16 18:20:00,112 - INFO - joeynmt.metrics - nrefs:1|case:lc|eff:no|tok:13a|smooth:exp|version:2.4.0
|
41 |
+
2024-01-16 18:20:00,115 - INFO - joeynmt.prediction - Evaluation result (beam search): bleu: 35.28, 1.3136[sec]
|
42 |
+
2024-01-16 18:20:00,128 - INFO - joeynmt.prediction - Translations saved to: iwslt14_prompt/hyp.test.
|
train.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
trg_vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
validations.txt
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Steps: 1000 loss: 5.33320 acc: 0.13981 ppl: 207.09943 bleu: 0.01656 LR: 0.00002000 *
|
2 |
+
Steps: 2000 loss: 4.71079 acc: 0.19451 ppl: 111.13951 bleu: 0.55809 LR: 0.00004000 *
|
3 |
+
Steps: 3000 loss: 4.20917 acc: 0.26398 ppl: 67.30097 bleu: 2.02249 LR: 0.00006000 *
|
4 |
+
Steps: 4000 loss: 3.50676 acc: 0.36811 ppl: 33.33998 bleu: 6.85361 LR: 0.00008000 *
|
5 |
+
Steps: 5000 loss: 3.10499 acc: 0.42404 ppl: 22.30898 bleu: 12.10052 LR: 0.00010000 *
|
6 |
+
Steps: 6000 loss: 2.74785 acc: 0.47426 ppl: 15.60911 bleu: 15.55999 LR: 0.00012000 *
|
7 |
+
Steps: 7000 loss: 2.59354 acc: 0.49875 ppl: 13.37698 bleu: 19.94671 LR: 0.00014000 *
|
8 |
+
Steps: 8000 loss: 2.49535 acc: 0.51680 ppl: 12.12593 bleu: 20.80670 LR: 0.00016000 *
|
9 |
+
Steps: 9000 loss: 2.32676 acc: 0.53745 ppl: 10.24473 bleu: 20.65046 LR: 0.00018000
|
10 |
+
Steps: 10000 loss: 2.32525 acc: 0.53374 ppl: 10.22927 bleu: 23.89984 LR: 0.00020000 *
|
11 |
+
Steps: 11000 loss: 2.26839 acc: 0.54531 ppl: 9.66380 bleu: 24.47579 LR: 0.00019069 *
|
12 |
+
Steps: 12000 loss: 2.21879 acc: 0.55346 ppl: 9.19621 bleu: 26.33193 LR: 0.00018257 *
|
13 |
+
Steps: 13000 loss: 2.11914 acc: 0.56789 ppl: 8.32394 bleu: 25.80609 LR: 0.00017541
|
14 |
+
Steps: 14000 loss: 2.13339 acc: 0.57107 ppl: 8.44348 bleu: 27.53361 LR: 0.00016903 *
|
15 |
+
Steps: 15000 loss: 2.10158 acc: 0.57200 ppl: 8.17911 bleu: 25.89823 LR: 0.00016330
|
16 |
+
Steps: 16000 loss: 2.01063 acc: 0.58820 ppl: 7.46800 bleu: 26.88964 LR: 0.00015811
|
17 |
+
Steps: 17000 loss: 2.03856 acc: 0.58597 ppl: 7.67955 bleu: 28.32236 LR: 0.00015339 *
|
18 |
+
Steps: 18000 loss: 2.04342 acc: 0.58335 ppl: 7.71696 bleu: 26.54742 LR: 0.00014907
|
19 |
+
Steps: 19000 loss: 2.02208 acc: 0.58512 ppl: 7.55403 bleu: 27.41188 LR: 0.00014510
|
20 |
+
Steps: 20000 loss: 1.96965 acc: 0.59832 ppl: 7.16818 bleu: 27.44443 LR: 0.00014142
|
21 |
+
Steps: 21000 loss: 2.00185 acc: 0.58957 ppl: 7.40273 bleu: 27.51776 LR: 0.00013801
|
22 |
+
Steps: 22000 loss: 1.91933 acc: 0.60069 ppl: 6.81640 bleu: 27.78165 LR: 0.00013484
|
23 |
+
Steps: 23000 loss: 1.93601 acc: 0.60446 ppl: 6.93107 bleu: 29.21029 LR: 0.00013188 *
|
24 |
+
Steps: 24000 loss: 1.95600 acc: 0.59864 ppl: 7.07096 bleu: 27.79109 LR: 0.00012910
|
25 |
+
Steps: 25000 loss: 2.00381 acc: 0.59484 ppl: 7.41723 bleu: 26.95928 LR: 0.00012649
|
26 |
+
Steps: 26000 loss: 2.05070 acc: 0.58464 ppl: 7.77331 bleu: 27.30420 LR: 0.00012403
|
27 |
+
Steps: 27000 loss: 1.94814 acc: 0.59961 ppl: 7.01565 bleu: 28.05348 LR: 0.00012172
|
28 |
+
Steps: 28000 loss: 2.04584 acc: 0.58991 ppl: 7.73569 bleu: 26.18726 LR: 0.00011952
|
29 |
+
Steps: 29000 loss: 2.09793 acc: 0.58589 ppl: 8.14927 bleu: 25.79140 LR: 0.00011744
|