may-ohta committed on
Commit
7657bb3
1 Parent(s): 68a88fe

initial commit

Files changed (10)
  1. avg5.ckpt +3 -0
  2. config.yaml +131 -0
  3. hyp.test +0 -0
  4. sp.model +3 -0
  5. src_vocab.txt +0 -0
  6. test.de-en.tsv +0 -0
  7. test.log +42 -0
  8. train.log +0 -0
  9. trg_vocab.txt +0 -0
  10. validations.txt +29 -0
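
The checkpoint, SentencePiece model, vocabularies, and TSV test set are stored via Git LFS, so a plain clone only fetches pointer stubs. A minimal sketch for pulling the actual files with huggingface_hub; the repo id is an assumption inferred from the committer name and the model_dir in config.yaml:

    from huggingface_hub import snapshot_download

    # Repo id is hypothetical, inferred from committer name + model_dir.
    local_dir = snapshot_download(repo_id="may-ohta/iwslt14_prompt")
    print(local_dir)  # should contain avg5.ckpt, config.yaml, sp.model, ...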
avg5.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16785664244d8110805afb1ced49b7f8fc5aec327e1ac6cd2024f4bfd3c0f55a
+ size 2812902554
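
The pointer pins the checkpoint to a sha256 oid and a byte size (about 2.8 GB), which makes a post-download integrity check straightforward. A standard-library sketch, assuming the blob was pulled to iwslt14_prompt/avg5.ckpt:

    import hashlib
    import os

    PATH = "iwslt14_prompt/avg5.ckpt"  # assumed local path
    EXPECTED_OID = "16785664244d8110805afb1ced49b7f8fc5aec327e1ac6cd2024f4bfd3c0f55a"
    EXPECTED_SIZE = 2812902554

    # A tiny file here means git-lfs left the pointer in place of the blob.
    assert os.path.getsize(PATH) == EXPECTED_SIZE, "size mismatch: pointer not resolved?"

    h = hashlib.sha256()
    with open(PATH, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    assert h.hexdigest() == EXPECTED_OID
    print("avg5.ckpt matches its LFS pointer")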
config.yaml ADDED
@@ -0,0 +1,131 @@
+ name: "iwslt14_deenfr_prompt"
+ joeynmt_version: "2.3.0"
+ model_dir: "iwslt14_prompt"
+ use_cuda: True
+ fp16: True
+ random_seed: 42
+
+ data:
+     #train: "iwslt14_prompt/train" # cf. https://wit3.fbk.eu/2014-01
+     #dev: "iwslt14_prompt/dev"
+     test: "iwslt14_prompt/test.de-en" # ['TED.dev2010', 'TEDX.dev2012', 'TED.tst2010', 'TED.tst2011', 'TED.tst2012']
+     dataset_type: "tsv"
+     sample_dev_subset: 500
+     src:
+         lang: "src"
+         max_length: 512
+         lowercase: False
+         normalize: False
+         level: "bpe"
+         voc_limit: 32000
+         voc_min_freq: 1
+         voc_file: "iwslt14_prompt/src_vocab.txt"
+         tokenizer_type: "sentencepiece"
+         tokenizer_cfg:
+             model_file: "iwslt14_prompt/sp.model"
+             model_type: "unigram"
+             character_coverage: 1.0
+     trg:
+         lang: "trg"
+         max_length: 512
+         lowercase: False
+         normalize: False
+         level: "bpe"
+         voc_limit: 32000
+         voc_min_freq: 1
+         voc_file: "iwslt14_prompt/trg_vocab.txt"
+         tokenizer_type: "sentencepiece"
+         tokenizer_cfg:
+             model_file: "iwslt14_prompt/sp.model"
+             model_type: "unigram"
+             character_coverage: 1.0
+     special_symbols:
+         unk_token: "<unk>"
+         unk_id: 0
+         pad_token: "<pad>"
+         pad_id: 1
+         bos_token: "<s>"
+         bos_id: 2
+         eos_token: "</s>"
+         eos_id: 3
+         sep_token: "<sep>"
+         sep_id: 4
+         lang_tags: ["<de>", "<en>", "<fr>"]
+
+ testing:
+     load_model: "iwslt14_prompt/avg5.ckpt"
+     n_best: 1
+     beam_size: 5
+     beam_alpha: 1.0
+     batch_size: 32
+     batch_type: "sentence"
+     max_output_length: 512
+     eval_metrics: ["bleu"]
+     sacrebleu_cfg:
+         tokenize: "13a"
+         lowercase: True
+
+ training:
+     #load_model: "iwslt14_prompt/latest.ckpt"
+     #reset_best_ckpt: True
+     #reset_scheduler: True
+     #reset_optimizer: True
+     #reset_iter_state: True
+     optimizer: "adamw"
+     normalization: "tokens"
+     adam_betas: [0.9, 0.98]
+     scheduling: "warmupinversesquareroot"
+     learning_rate_warmup: 10000
+     learning_rate: 0.0002
+     learning_rate_min: 0.0000001
+     weight_decay: 0.001
+     label_smoothing: 0.1
+     loss: "crossentropy"
+     batch_size: 32
+     batch_type: "sentence"
+     batch_multiplier: 4
+     early_stopping_metric: "bleu"
+     epochs: 50
+     validation_freq: 1000
+     logging_freq: 100
+     overwrite: False
+     shuffle: True
+     print_valid_sents: [0, 1, 2, 3]
+     keep_best_ckpts: 5
+
+ model:
+     initializer: "xavier_uniform"
+     bias_initializer: "zeros"
+     init_gain: 1.0
+     embed_initializer: "xavier_uniform"
+     embed_init_gain: 1.0
+     tied_embeddings: True
+     tied_softmax: True
+     encoder:
+         type: "transformer"
+         num_layers: 6
+         num_heads: 8
+         embeddings:
+             embedding_dim: 1024
+             scale: True
+             dropout: 0.1
+         # typically ff_size = 4 x hidden_size
+         hidden_size: 1024
+         ff_size: 4096
+         dropout: 0.1
+         layer_norm: "pre"
+         activation: "relu"
+     decoder:
+         type: "transformer"
+         num_layers: 6
+         num_heads: 8
+         embeddings:
+             embedding_dim: 1024
+             scale: True
+             dropout: 0.1
+         # typically ff_size = 4 x hidden_size
+         hidden_size: 1024
+         ff_size: 4096
+         dropout: 0.1
+         layer_norm: "pre"
+         activation: "relu"
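
The testing block names the averaged checkpoint, beam settings, and sacrebleu options that produced the score in test.log, so decoding should be reproducible with JoeyNMT's standard entry point (python -m joeynmt test config.yaml). As a cheaper sanity check before a full decode, the config can be parsed with PyYAML; the path below assumes the repo was pulled into iwslt14_prompt/:

    import yaml

    # Minimal sketch: parse the config above and verify the fields the test run relies on.
    with open("iwslt14_prompt/config.yaml") as f:
        cfg = yaml.safe_load(f)

    assert cfg["joeynmt_version"] == "2.3.0"
    assert cfg["testing"]["load_model"] == "iwslt14_prompt/avg5.ckpt"
    assert cfg["testing"]["beam_size"] == 5 and cfg["testing"]["beam_alpha"] == 1.0
    assert cfg["data"]["special_symbols"]["lang_tags"] == ["<de>", "<en>", "<fr>"]
    print(cfg["model"]["encoder"]["hidden_size"])  # 1024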
hyp.test ADDED
The diff for this file is too large to render. See raw diff
 
sp.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aaa7aad830134efe55afcbd6690ec305a7ed1599b10e499d6be3133fb6872251
+ size 832871
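
sp.model is the shared unigram SentencePiece model referenced by both tokenizer_cfg blocks in config.yaml (src and trg use the same file). A quick round-trip sketch with the sentencepiece package; the sample sentence is arbitrary, and the <de>/<en>/<fr> language tags are handled as special symbols in the JoeyNMT vocabularies rather than by this model:

    import sentencepiece as spm

    # Assumes the LFS blob has been pulled to this path.
    sp = spm.SentencePieceProcessor(model_file="iwslt14_prompt/sp.model")

    pieces = sp.encode("Vielen Dank für Ihre Aufmerksamkeit.", out_type=str)
    print(pieces)             # subword pieces from the unigram model
    print(sp.decode(pieces))  # round-trips to the original string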
src_vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
test.de-en.tsv ADDED
The diff for this file is too large to render. See raw diff
 
test.log ADDED
@@ -0,0 +1,42 @@
+ 2024-01-16 17:27:15,168 - INFO - root - Hello! This is Joey-NMT (version 2.3.0).
+ 2024-01-16 17:27:15,255 - INFO - joeynmt.data - Building tokenizer...
+ 2024-01-16 17:27:15,340 - INFO - joeynmt.tokenizers - src tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 512), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
+ 2024-01-16 17:27:15,341 - INFO - joeynmt.tokenizers - trg tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 512), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
+ 2024-01-16 17:27:15,341 - INFO - joeynmt.data - Building vocabulary...
+ 2024-01-16 17:28:01,074 - INFO - root - Hello! This is Joey-NMT (version 2.3.0).
+ 2024-01-16 17:28:01,157 - INFO - joeynmt.data - Building tokenizer...
+ 2024-01-16 17:28:01,241 - INFO - joeynmt.tokenizers - src tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 512), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
+ 2024-01-16 17:28:01,242 - INFO - joeynmt.tokenizers - trg tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 512), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
+ 2024-01-16 17:28:01,242 - INFO - joeynmt.data - Building vocabulary...
+ 2024-01-16 17:28:11,213 - INFO - joeynmt.data - Loading test set...
+ 2024-01-16 17:28:11,970 - INFO - numexpr.utils - Note: detected 96 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+ 2024-01-16 17:28:11,970 - INFO - numexpr.utils - Note: NumExpr detected 96 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+ 2024-01-16 17:28:13,195 - INFO - joeynmt.data - Data loaded.
+ 2024-01-16 17:28:13,196 - INFO - joeynmt.data - Train dataset: None
+ 2024-01-16 17:28:13,196 - INFO - joeynmt.data - Valid dataset: None
+ 2024-01-16 17:28:13,196 - INFO - joeynmt.data - Test dataset: TsvDataset(split=test, len=6743, src_lang=src, trg_lang=trg, has_trg=True, random_subset=-1, has_src_prompt=True, has_trg_prompt=True)
+ 2024-01-16 17:28:13,196 - INFO - joeynmt.data - First 10 Src tokens: (0) <unk> (1) <pad> (2) <s> (3) </s> (4) <sep> (5) <de> (6) <en> (7) <fr> (8) , (9) .
+ 2024-01-16 17:28:13,196 - INFO - joeynmt.data - First 10 Trg tokens: (0) <unk> (1) <pad> (2) <s> (3) </s> (4) <sep> (5) <de> (6) <en> (7) <fr> (8) , (9) .
+ 2024-01-16 17:28:13,196 - INFO - joeynmt.data - Number of unique Src tokens (vocab_size): 32000
+ 2024-01-16 17:28:13,196 - INFO - joeynmt.data - Number of unique Trg tokens (vocab_size): 32000
+ 2024-01-16 17:28:13,196 - INFO - joeynmt.model - Building an encoder-decoder model...
+ 2024-01-16 17:28:16,416 - INFO - joeynmt.model - Enc-dec model built.
+ 2024-01-16 17:28:16,419 - INFO - joeynmt.model - Total params: 209129472
+ 2024-01-16 17:28:16,420 - DEBUG - joeynmt.model - Trainable parameters: ['decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.dec_layer_norm.bias', 'decoder.layers.0.dec_layer_norm.weight', 'decoder.layers.0.feed_forward.layer_norm.bias', 'decoder.layers.0.feed_forward.layer_norm.weight', 'decoder.layers.0.feed_forward.pwff_layer.0.bias', 'decoder.layers.0.feed_forward.pwff_layer.0.weight', 'decoder.layers.0.feed_forward.pwff_layer.3.bias', 'decoder.layers.0.feed_forward.pwff_layer.3.weight', 'decoder.layers.0.src_trg_att.k_layer.bias', 'decoder.layers.0.src_trg_att.k_layer.weight', 'decoder.layers.0.src_trg_att.output_layer.bias', 'decoder.layers.0.src_trg_att.output_layer.weight', 'decoder.layers.0.src_trg_att.q_layer.bias', 'decoder.layers.0.src_trg_att.q_layer.weight', 'decoder.layers.0.src_trg_att.v_layer.bias', 'decoder.layers.0.src_trg_att.v_layer.weight', 'decoder.layers.0.trg_trg_att.k_layer.bias', 'decoder.layers.0.trg_trg_att.k_layer.weight', 'decoder.layers.0.trg_trg_att.output_layer.bias', 'decoder.layers.0.trg_trg_att.output_layer.weight', 'decoder.layers.0.trg_trg_att.q_layer.bias', 'decoder.layers.0.trg_trg_att.q_layer.weight', 'decoder.layers.0.trg_trg_att.v_layer.bias', 'decoder.layers.0.trg_trg_att.v_layer.weight', 'decoder.layers.0.x_layer_norm.bias', 'decoder.layers.0.x_layer_norm.weight', 'decoder.layers.1.dec_layer_norm.bias', 'decoder.layers.1.dec_layer_norm.weight', 'decoder.layers.1.feed_forward.layer_norm.bias', 'decoder.layers.1.feed_forward.layer_norm.weight', 'decoder.layers.1.feed_forward.pwff_layer.0.bias', 'decoder.layers.1.feed_forward.pwff_layer.0.weight', 'decoder.layers.1.feed_forward.pwff_layer.3.bias', 'decoder.layers.1.feed_forward.pwff_layer.3.weight', 'decoder.layers.1.src_trg_att.k_layer.bias', 'decoder.layers.1.src_trg_att.k_layer.weight', 'decoder.layers.1.src_trg_att.output_layer.bias', 'decoder.layers.1.src_trg_att.output_layer.weight', 'decoder.layers.1.src_trg_att.q_layer.bias', 'decoder.layers.1.src_trg_att.q_layer.weight', 'decoder.layers.1.src_trg_att.v_layer.bias', 'decoder.layers.1.src_trg_att.v_layer.weight', 'decoder.layers.1.trg_trg_att.k_layer.bias', 'decoder.layers.1.trg_trg_att.k_layer.weight', 'decoder.layers.1.trg_trg_att.output_layer.bias', 'decoder.layers.1.trg_trg_att.output_layer.weight', 'decoder.layers.1.trg_trg_att.q_layer.bias', 'decoder.layers.1.trg_trg_att.q_layer.weight', 'decoder.layers.1.trg_trg_att.v_layer.bias', 'decoder.layers.1.trg_trg_att.v_layer.weight', 'decoder.layers.1.x_layer_norm.bias', 'decoder.layers.1.x_layer_norm.weight', 'decoder.layers.2.dec_layer_norm.bias', 'decoder.layers.2.dec_layer_norm.weight', 'decoder.layers.2.feed_forward.layer_norm.bias', 'decoder.layers.2.feed_forward.layer_norm.weight', 'decoder.layers.2.feed_forward.pwff_layer.0.bias', 'decoder.layers.2.feed_forward.pwff_layer.0.weight', 'decoder.layers.2.feed_forward.pwff_layer.3.bias', 'decoder.layers.2.feed_forward.pwff_layer.3.weight', 'decoder.layers.2.src_trg_att.k_layer.bias', 'decoder.layers.2.src_trg_att.k_layer.weight', 'decoder.layers.2.src_trg_att.output_layer.bias', 'decoder.layers.2.src_trg_att.output_layer.weight', 'decoder.layers.2.src_trg_att.q_layer.bias', 'decoder.layers.2.src_trg_att.q_layer.weight', 'decoder.layers.2.src_trg_att.v_layer.bias', 'decoder.layers.2.src_trg_att.v_layer.weight', 'decoder.layers.2.trg_trg_att.k_layer.bias', 'decoder.layers.2.trg_trg_att.k_layer.weight', 'decoder.layers.2.trg_trg_att.output_layer.bias', 'decoder.layers.2.trg_trg_att.output_layer.weight', 'decoder.layers.2.trg_trg_att.q_layer.bias', 'decoder.layers.2.trg_trg_att.q_layer.weight', 'decoder.layers.2.trg_trg_att.v_layer.bias', 'decoder.layers.2.trg_trg_att.v_layer.weight', 'decoder.layers.2.x_layer_norm.bias', 'decoder.layers.2.x_layer_norm.weight', 'decoder.layers.3.dec_layer_norm.bias', 'decoder.layers.3.dec_layer_norm.weight', 'decoder.layers.3.feed_forward.layer_norm.bias', 'decoder.layers.3.feed_forward.layer_norm.weight', 'decoder.layers.3.feed_forward.pwff_layer.0.bias', 'decoder.layers.3.feed_forward.pwff_layer.0.weight', 'decoder.layers.3.feed_forward.pwff_layer.3.bias', 'decoder.layers.3.feed_forward.pwff_layer.3.weight', 'decoder.layers.3.src_trg_att.k_layer.bias', 'decoder.layers.3.src_trg_att.k_layer.weight', 'decoder.layers.3.src_trg_att.output_layer.bias', 'decoder.layers.3.src_trg_att.output_layer.weight', 'decoder.layers.3.src_trg_att.q_layer.bias', 'decoder.layers.3.src_trg_att.q_layer.weight', 'decoder.layers.3.src_trg_att.v_layer.bias', 'decoder.layers.3.src_trg_att.v_layer.weight', 'decoder.layers.3.trg_trg_att.k_layer.bias', 'decoder.layers.3.trg_trg_att.k_layer.weight', 'decoder.layers.3.trg_trg_att.output_layer.bias', 'decoder.layers.3.trg_trg_att.output_layer.weight', 'decoder.layers.3.trg_trg_att.q_layer.bias', 'decoder.layers.3.trg_trg_att.q_layer.weight', 'decoder.layers.3.trg_trg_att.v_layer.bias', 'decoder.layers.3.trg_trg_att.v_layer.weight', 'decoder.layers.3.x_layer_norm.bias', 'decoder.layers.3.x_layer_norm.weight', 'decoder.layers.4.dec_layer_norm.bias', 'decoder.layers.4.dec_layer_norm.weight', 'decoder.layers.4.feed_forward.layer_norm.bias', 'decoder.layers.4.feed_forward.layer_norm.weight', 'decoder.layers.4.feed_forward.pwff_layer.0.bias', 'decoder.layers.4.feed_forward.pwff_layer.0.weight', 'decoder.layers.4.feed_forward.pwff_layer.3.bias', 'decoder.layers.4.feed_forward.pwff_layer.3.weight', 'decoder.layers.4.src_trg_att.k_layer.bias', 'decoder.layers.4.src_trg_att.k_layer.weight', 'decoder.layers.4.src_trg_att.output_layer.bias', 'decoder.layers.4.src_trg_att.output_layer.weight', 'decoder.layers.4.src_trg_att.q_layer.bias', 'decoder.layers.4.src_trg_att.q_layer.weight', 'decoder.layers.4.src_trg_att.v_layer.bias', 'decoder.layers.4.src_trg_att.v_layer.weight', 'decoder.layers.4.trg_trg_att.k_layer.bias', 'decoder.layers.4.trg_trg_att.k_layer.weight', 'decoder.layers.4.trg_trg_att.output_layer.bias', 'decoder.layers.4.trg_trg_att.output_layer.weight', 'decoder.layers.4.trg_trg_att.q_layer.bias', 'decoder.layers.4.trg_trg_att.q_layer.weight', 'decoder.layers.4.trg_trg_att.v_layer.bias', 'decoder.layers.4.trg_trg_att.v_layer.weight', 'decoder.layers.4.x_layer_norm.bias', 'decoder.layers.4.x_layer_norm.weight', 'decoder.layers.5.dec_layer_norm.bias', 'decoder.layers.5.dec_layer_norm.weight', 'decoder.layers.5.feed_forward.layer_norm.bias', 'decoder.layers.5.feed_forward.layer_norm.weight', 'decoder.layers.5.feed_forward.pwff_layer.0.bias', 'decoder.layers.5.feed_forward.pwff_layer.0.weight', 'decoder.layers.5.feed_forward.pwff_layer.3.bias', 'decoder.layers.5.feed_forward.pwff_layer.3.weight', 'decoder.layers.5.src_trg_att.k_layer.bias', 'decoder.layers.5.src_trg_att.k_layer.weight', 'decoder.layers.5.src_trg_att.output_layer.bias', 'decoder.layers.5.src_trg_att.output_layer.weight', 'decoder.layers.5.src_trg_att.q_layer.bias', 'decoder.layers.5.src_trg_att.q_layer.weight', 'decoder.layers.5.src_trg_att.v_layer.bias', 'decoder.layers.5.src_trg_att.v_layer.weight', 'decoder.layers.5.trg_trg_att.k_layer.bias', 'decoder.layers.5.trg_trg_att.k_layer.weight', 'decoder.layers.5.trg_trg_att.output_layer.bias', 'decoder.layers.5.trg_trg_att.output_layer.weight', 'decoder.layers.5.trg_trg_att.q_layer.bias', 'decoder.layers.5.trg_trg_att.q_layer.weight', 'decoder.layers.5.trg_trg_att.v_layer.bias', 'decoder.layers.5.trg_trg_att.v_layer.weight', 'decoder.layers.5.x_layer_norm.bias', 'decoder.layers.5.x_layer_norm.weight', 'encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.feed_forward.layer_norm.bias', 'encoder.layers.0.feed_forward.layer_norm.weight', 'encoder.layers.0.feed_forward.pwff_layer.0.bias', 'encoder.layers.0.feed_forward.pwff_layer.0.weight', 'encoder.layers.0.feed_forward.pwff_layer.3.bias', 'encoder.layers.0.feed_forward.pwff_layer.3.weight', 'encoder.layers.0.layer_norm.bias', 'encoder.layers.0.layer_norm.weight', 'encoder.layers.0.src_src_att.k_layer.bias', 'encoder.layers.0.src_src_att.k_layer.weight', 'encoder.layers.0.src_src_att.output_layer.bias', 'encoder.layers.0.src_src_att.output_layer.weight', 'encoder.layers.0.src_src_att.q_layer.bias', 'encoder.layers.0.src_src_att.q_layer.weight', 'encoder.layers.0.src_src_att.v_layer.bias', 'encoder.layers.0.src_src_att.v_layer.weight', 'encoder.layers.1.feed_forward.layer_norm.bias', 'encoder.layers.1.feed_forward.layer_norm.weight', 'encoder.layers.1.feed_forward.pwff_layer.0.bias', 'encoder.layers.1.feed_forward.pwff_layer.0.weight', 'encoder.layers.1.feed_forward.pwff_layer.3.bias', 'encoder.layers.1.feed_forward.pwff_layer.3.weight', 'encoder.layers.1.layer_norm.bias', 'encoder.layers.1.layer_norm.weight', 'encoder.layers.1.src_src_att.k_layer.bias', 'encoder.layers.1.src_src_att.k_layer.weight', 'encoder.layers.1.src_src_att.output_layer.bias', 'encoder.layers.1.src_src_att.output_layer.weight', 'encoder.layers.1.src_src_att.q_layer.bias', 'encoder.layers.1.src_src_att.q_layer.weight', 'encoder.layers.1.src_src_att.v_layer.bias', 'encoder.layers.1.src_src_att.v_layer.weight', 'encoder.layers.2.feed_forward.layer_norm.bias', 'encoder.layers.2.feed_forward.layer_norm.weight', 'encoder.layers.2.feed_forward.pwff_layer.0.bias', 'encoder.layers.2.feed_forward.pwff_layer.0.weight', 'encoder.layers.2.feed_forward.pwff_layer.3.bias', 'encoder.layers.2.feed_forward.pwff_layer.3.weight', 'encoder.layers.2.layer_norm.bias', 'encoder.layers.2.layer_norm.weight', 'encoder.layers.2.src_src_att.k_layer.bias', 'encoder.layers.2.src_src_att.k_layer.weight', 'encoder.layers.2.src_src_att.output_layer.bias', 'encoder.layers.2.src_src_att.output_layer.weight', 'encoder.layers.2.src_src_att.q_layer.bias', 'encoder.layers.2.src_src_att.q_layer.weight', 'encoder.layers.2.src_src_att.v_layer.bias', 'encoder.layers.2.src_src_att.v_layer.weight', 'encoder.layers.3.feed_forward.layer_norm.bias', 'encoder.layers.3.feed_forward.layer_norm.weight', 'encoder.layers.3.feed_forward.pwff_layer.0.bias', 'encoder.layers.3.feed_forward.pwff_layer.0.weight', 'encoder.layers.3.feed_forward.pwff_layer.3.bias', 'encoder.layers.3.feed_forward.pwff_layer.3.weight', 'encoder.layers.3.layer_norm.bias', 'encoder.layers.3.layer_norm.weight', 'encoder.layers.3.src_src_att.k_layer.bias', 'encoder.layers.3.src_src_att.k_layer.weight', 'encoder.layers.3.src_src_att.output_layer.bias', 'encoder.layers.3.src_src_att.output_layer.weight', 'encoder.layers.3.src_src_att.q_layer.bias', 'encoder.layers.3.src_src_att.q_layer.weight', 'encoder.layers.3.src_src_att.v_layer.bias', 'encoder.layers.3.src_src_att.v_layer.weight', 'encoder.layers.4.feed_forward.layer_norm.bias', 'encoder.layers.4.feed_forward.layer_norm.weight', 'encoder.layers.4.feed_forward.pwff_layer.0.bias', 'encoder.layers.4.feed_forward.pwff_layer.0.weight', 'encoder.layers.4.feed_forward.pwff_layer.3.bias', 'encoder.layers.4.feed_forward.pwff_layer.3.weight', 'encoder.layers.4.layer_norm.bias', 'encoder.layers.4.layer_norm.weight', 'encoder.layers.4.src_src_att.k_layer.bias', 'encoder.layers.4.src_src_att.k_layer.weight', 'encoder.layers.4.src_src_att.output_layer.bias', 'encoder.layers.4.src_src_att.output_layer.weight', 'encoder.layers.4.src_src_att.q_layer.bias', 'encoder.layers.4.src_src_att.q_layer.weight', 'encoder.layers.4.src_src_att.v_layer.bias', 'encoder.layers.4.src_src_att.v_layer.weight', 'encoder.layers.5.feed_forward.layer_norm.bias', 'encoder.layers.5.feed_forward.layer_norm.weight', 'encoder.layers.5.feed_forward.pwff_layer.0.bias', 'encoder.layers.5.feed_forward.pwff_layer.0.weight', 'encoder.layers.5.feed_forward.pwff_layer.3.bias', 'encoder.layers.5.feed_forward.pwff_layer.3.weight', 'encoder.layers.5.layer_norm.bias', 'encoder.layers.5.layer_norm.weight', 'encoder.layers.5.src_src_att.k_layer.bias', 'encoder.layers.5.src_src_att.k_layer.weight', 'encoder.layers.5.src_src_att.output_layer.bias', 'encoder.layers.5.src_src_att.output_layer.weight', 'encoder.layers.5.src_src_att.q_layer.bias', 'encoder.layers.5.src_src_att.q_layer.weight', 'encoder.layers.5.src_src_att.v_layer.bias', 'encoder.layers.5.src_src_att.v_layer.weight', 'src_embed.lut.weight']
+ 2024-01-16 17:28:16,421 - INFO - joeynmt.prediction - Loading model from models/iwslt14_prompt/avg5.ckpt
+ 2024-01-16 17:28:24,668 - INFO - joeynmt.prediction - DataParallelWrapper(
+   (module): DataParallel(
+     (module): Model(
+       encoder=TransformerEncoder(num_layers=6, num_heads=8, alpha=1.0, layer_norm="pre", activation=ReLU()),
+       decoder=TransformerDecoder(num_layers=6, num_heads=8, alpha=1.0, layer_norm="pre", activation=ReLU()),
+       src_embed=Embeddings(embedding_dim=1024, vocab_size=32000),
+       trg_embed=Embeddings(embedding_dim=1024, vocab_size=32000),
+       loss_function=XentLoss(criterion=KLDivLoss(), smoothing=0.1))
+   )
+ )
+ 2024-01-16 17:28:24,681 - INFO - joeynmt.prediction - Decoding on test set... (device: cuda, n_gpu: 2, use_ddp: False, fp16: True)
+ 2024-01-16 17:28:24,681 - INFO - joeynmt.prediction - Predicting 6743 example(s)... (Beam search with beam_size=5, beam_alpha=1.0, n_best=1, min_output_length=1, max_output_length=512, return_prob='none', generate_unk=True, repetition_penalty=-1, no_repeat_ngram_size=-1)
+ 2024-01-16 18:19:58,646 - INFO - joeynmt.prediction - Generation took 3093.9641[sec].
+ 2024-01-16 18:20:00,112 - INFO - joeynmt.metrics - nrefs:1|case:lc|eff:no|tok:13a|smooth:exp|version:2.4.0
+ 2024-01-16 18:20:00,115 - INFO - joeynmt.prediction - Evaluation result (beam search): bleu: 35.28, 1.3136[sec]
+ 2024-01-16 18:20:00,128 - INFO - joeynmt.prediction - Translations saved to: iwslt14_prompt/hyp.test.
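
The metrics signature in the log (nrefs:1|case:lc|eff:no|tok:13a|smooth:exp) fully specifies the BLEU computation, so the 35.28 should be reproducible offline from hyp.test. A sacrebleu sketch; the assumption that references sit in the second tab-separated column of test.de-en.tsv is mine, not the log's:

    from sacrebleu.metrics import BLEU

    with open("iwslt14_prompt/hyp.test", encoding="utf-8") as f:
        hyps = [line.rstrip("\n") for line in f]
    # Column layout is an assumption; adjust if the reference is elsewhere.
    with open("iwslt14_prompt/test.de-en.tsv", encoding="utf-8") as f:
        refs = [line.rstrip("\n").split("\t")[1] for line in f]

    bleu = BLEU(lowercase=True, tokenize="13a")  # matches case:lc, tok:13a, smooth:exp
    print(bleu.corpus_score(hyps, [refs]))       # expect BLEU ≈ 35.28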
train.log ADDED
The diff for this file is too large to render. See raw diff
 
trg_vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
validations.txt ADDED
@@ -0,0 +1,29 @@
+ Steps: 1000 loss: 5.33320 acc: 0.13981 ppl: 207.09943 bleu: 0.01656 LR: 0.00002000 *
+ Steps: 2000 loss: 4.71079 acc: 0.19451 ppl: 111.13951 bleu: 0.55809 LR: 0.00004000 *
+ Steps: 3000 loss: 4.20917 acc: 0.26398 ppl: 67.30097 bleu: 2.02249 LR: 0.00006000 *
+ Steps: 4000 loss: 3.50676 acc: 0.36811 ppl: 33.33998 bleu: 6.85361 LR: 0.00008000 *
+ Steps: 5000 loss: 3.10499 acc: 0.42404 ppl: 22.30898 bleu: 12.10052 LR: 0.00010000 *
+ Steps: 6000 loss: 2.74785 acc: 0.47426 ppl: 15.60911 bleu: 15.55999 LR: 0.00012000 *
+ Steps: 7000 loss: 2.59354 acc: 0.49875 ppl: 13.37698 bleu: 19.94671 LR: 0.00014000 *
+ Steps: 8000 loss: 2.49535 acc: 0.51680 ppl: 12.12593 bleu: 20.80670 LR: 0.00016000 *
+ Steps: 9000 loss: 2.32676 acc: 0.53745 ppl: 10.24473 bleu: 20.65046 LR: 0.00018000
+ Steps: 10000 loss: 2.32525 acc: 0.53374 ppl: 10.22927 bleu: 23.89984 LR: 0.00020000 *
+ Steps: 11000 loss: 2.26839 acc: 0.54531 ppl: 9.66380 bleu: 24.47579 LR: 0.00019069 *
+ Steps: 12000 loss: 2.21879 acc: 0.55346 ppl: 9.19621 bleu: 26.33193 LR: 0.00018257 *
+ Steps: 13000 loss: 2.11914 acc: 0.56789 ppl: 8.32394 bleu: 25.80609 LR: 0.00017541
+ Steps: 14000 loss: 2.13339 acc: 0.57107 ppl: 8.44348 bleu: 27.53361 LR: 0.00016903 *
+ Steps: 15000 loss: 2.10158 acc: 0.57200 ppl: 8.17911 bleu: 25.89823 LR: 0.00016330
+ Steps: 16000 loss: 2.01063 acc: 0.58820 ppl: 7.46800 bleu: 26.88964 LR: 0.00015811
+ Steps: 17000 loss: 2.03856 acc: 0.58597 ppl: 7.67955 bleu: 28.32236 LR: 0.00015339 *
+ Steps: 18000 loss: 2.04342 acc: 0.58335 ppl: 7.71696 bleu: 26.54742 LR: 0.00014907
+ Steps: 19000 loss: 2.02208 acc: 0.58512 ppl: 7.55403 bleu: 27.41188 LR: 0.00014510
+ Steps: 20000 loss: 1.96965 acc: 0.59832 ppl: 7.16818 bleu: 27.44443 LR: 0.00014142
+ Steps: 21000 loss: 2.00185 acc: 0.58957 ppl: 7.40273 bleu: 27.51776 LR: 0.00013801
+ Steps: 22000 loss: 1.91933 acc: 0.60069 ppl: 6.81640 bleu: 27.78165 LR: 0.00013484
+ Steps: 23000 loss: 1.93601 acc: 0.60446 ppl: 6.93107 bleu: 29.21029 LR: 0.00013188 *
+ Steps: 24000 loss: 1.95600 acc: 0.59864 ppl: 7.07096 bleu: 27.79109 LR: 0.00012910
+ Steps: 25000 loss: 2.00381 acc: 0.59484 ppl: 7.41723 bleu: 26.95928 LR: 0.00012649
+ Steps: 26000 loss: 2.05070 acc: 0.58464 ppl: 7.77331 bleu: 27.30420 LR: 0.00012403
+ Steps: 27000 loss: 1.94814 acc: 0.59961 ppl: 7.01565 bleu: 28.05348 LR: 0.00012172
+ Steps: 28000 loss: 2.04584 acc: 0.58991 ppl: 7.73569 bleu: 26.18726 LR: 0.00011952
+ Steps: 29000 loss: 2.09793 acc: 0.58589 ppl: 8.14927 bleu: 25.79140 LR: 0.00011744
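
A trailing * marks a new best BLEU at that validation step. Given keep_best_ckpts: 5 in config.yaml and the avg5.ckpt filename, a plausible reading (not stated anywhere in the repo) is that the released weights average the five best checkpoints. A sketch that recovers those steps from this file:

    import re

    # Collect (bleu, step) pairs from validations.txt.
    rows = []
    with open("iwslt14_prompt/validations.txt", encoding="utf-8") as f:
        for line in f:
            m = re.search(r"Steps:\s*(\d+).*?bleu:\s*([\d.]+)", line)
            if m:
                rows.append((float(m.group(2)), int(m.group(1))))

    best5 = sorted(rows, reverse=True)[:5]
    print(best5)  # best shown here: step 23000 at bleu 29.21029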