# torch packages
import torch
import torch.nn as nn
from torch import Tensor

# project sublayers used in this module
from model.sublayers import (
    MultiHeadAttention,
    PositionwiseFeedForward,
)


class DecoderLayer(nn.Module):
    def __init__(
                self, 
                dk, 
                dv, 
                h,
                dim_multiplier = 4, 
                pdropout = 0.1):
        super().__init__()
        
        # d_model = d_k * h, following the paper's convention d_k = d_v = d_model / h
        # (page 5, Section 3.2.2, Multi-Head Attention)
        dmodel = dk * h
        # Inner feed-forward dimension d_ff = dim_multiplier * d_model = 4 * d_model
        # (page 5, Section 3.3, Position-wise Feed-Forward Networks)
        dff = dmodel * dim_multiplier
        
        # Masked Multi Head Attention
        self.masked_attention = MultiHeadAttention(dk, dv, h, pdropout)
        self.masked_attn_norm = nn.LayerNorm(dmodel)
        
        # Multi head attention
        self.attention = MultiHeadAttention(dk, dv, h, pdropout)
        self.attn_norm = nn.LayerNorm(dmodel)
        
        # Add position FeedForward Network
        self.ff = PositionwiseFeedForward(dmodel, dff, pdropout=pdropout)
        self.ff_norm = nn.LayerNorm(dmodel)
        
        self.dropout = nn.Dropout(p = pdropout)
        
    def forward(self, 
                trg: Tensor, 
                src: Tensor, 
                trg_mask: Tensor, 
                src_mask: Tensor):
        """
        Args:
            trg:          embedded sequences                (batch_size, trg_seq_length, d_model)
            src:          embedded sequences                (batch_size, src_seq_length, d_model)
            trg_mask:     mask for the sequences            (batch_size, 1, trg_seq_length, trg_seq_length)
            src_mask:     mask for the sequences            (batch_size, 1, 1, src_seq_length)

        Returns:
            trg:          sequences after self-attention    (batch_size, trg_seq_length, d_model)
            attn_probs:   encoder-decoder attention softmax scores (batch_size, n_heads, trg_seq_length, src_seq_length)
        """
        _trg, attn_probs = self.masked_attention(
                                query = trg, 
                                key = trg, 
                                val = trg, 
                                mask = trg_mask)
        
        # Residual connection around the sub-layer followed by layer norm (Section 3.1);
        # dropout is applied to the sub-layer output before the addition
        # (page 7, Section 5.4 "Regularization")
        trg = self.masked_attn_norm(trg + self.dropout(_trg))
        
        # Encoder-decoder (cross-)attention:
        #   query       = output of the previous decoder sub-layer
        #   key and val = output of the encoder stack
        #   mask        = src_mask
        # Reference: page 5, Section 3.2.3, point 1
        _trg, attn_probs = self.attention(
                                query = trg, 
                                key = src, 
                                val = src, 
                                mask = src_mask)
        trg = self.attn_norm(trg + self.dropout(_trg))
        
        # position-wise feed-forward network
        _trg = self.ff(trg)
        # Perform Add Norm again
        trg = self.ff_norm(trg + self.dropout(_trg))
        return trg, attn_probs
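

# The masks consumed by DecoderLayer are built outside this module. The helpers
# below are a minimal illustrative sketch (the names `make_source_mask` /
# `make_target_mask`, the `pad_idx` argument, and the "True = attend" boolean
# convention are assumptions, not part of this repo) showing how masks with the
# shapes documented above could be constructed.
def make_source_mask(src_ids: Tensor, pad_idx: int) -> Tensor:
    # (batch_size, src_seq_length) -> (batch_size, 1, 1, src_seq_length)
    return (src_ids != pad_idx).unsqueeze(1).unsqueeze(2)


def make_target_mask(trg_ids: Tensor, pad_idx: int) -> Tensor:
    # padding mask: (batch_size, 1, 1, trg_seq_length)
    pad_mask = (trg_ids != pad_idx).unsqueeze(1).unsqueeze(2)
    # causal (no-peek) mask: (trg_seq_length, trg_seq_length)
    seq_len = trg_ids.size(1)
    causal = torch.tril(torch.ones(seq_len, seq_len, device=trg_ids.device)).bool()
    # broadcasts to (batch_size, 1, trg_seq_length, trg_seq_length)
    return pad_mask & causal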
    

class Decoder(nn.Module):
    def __init__(
                self, 
                dk, 
                dv, 
                h, 
                num_decoders, 
                dim_multiplier = 4, 
                pdropout=0.1):
        super().__init__()
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(dk, 
                         dv, 
                         h, 
                         dim_multiplier, 
                         pdropout) for _ in range(num_decoders)
        ])
        
    def forward(self, target_inputs, src_inputs, target_mask, src_mask):
        """
        Input from the Embedding layer
        target_inputs = embedded sequences    (batch_size, trg_seq_length, d_model)
        src_inputs = embedded sequences       (batch_size, src_seq_length, d_model)
        target_mask = mask for the sequences  (batch_size, 1, trg_seq_length, trg_seq_length)
        src_mask = mask for the sequences     (batch_size, 1, 1, src_seq_length)
        """
        target_representation = target_inputs
        
        # Forward pass through decoder stack
        for layer in self.decoder_layers:
            target_representation, attn_probs = layer(
                                    target_representation,
                                    src_inputs, 
                                    target_mask,
                                    src_mask)
        # Keep the cross-attention weights of the final decoder layer (e.g. for visualization)
        self.attn_probs = attn_probs
        return target_representation
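

# Minimal smoke test (illustrative only): pushes a random batch through the
# decoder stack and checks the output shape against the documented contract.
# The sizes below (dk = dv = 64, h = 8, so d_model = 512, num_decoders = 2)
# are arbitrary choices mirroring the paper's base model; boolean masks with
# "True = attend" are assumed, so adapt them to whatever convention
# model.sublayers.MultiHeadAttention actually expects.
if __name__ == "__main__":
    batch_size, src_len, trg_len = 2, 7, 5
    dk = dv = 64
    h = 8
    dmodel = dk * h

    decoder = Decoder(dk, dv, h, num_decoders=2)

    src = torch.rand(batch_size, src_len, dmodel)   # stand-in for encoder output
    trg = torch.rand(batch_size, trg_len, dmodel)   # stand-in for embedded targets

    # source padding mask: (batch_size, 1, 1, src_seq_length)
    src_mask = torch.ones(batch_size, 1, 1, src_len, dtype=torch.bool)
    # causal target mask broadcast to (batch_size, 1, trg_seq_length, trg_seq_length)
    trg_mask = (
        torch.tril(torch.ones(trg_len, trg_len)).bool()
        .expand(batch_size, 1, trg_len, trg_len)
    )

    out = decoder(trg, src, trg_mask, src_mask)
    assert out.shape == (batch_size, trg_len, dmodel)
    print("decoder output:", tuple(out.shape))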