import math
import copy
import time
import random
import spacy
import numpy as np
import os
# torch packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import torch.optim as optim
from model.sublayers import (
    MultiHeadAttention,
    PositionalEncoding,
    PositionwiseFeedForward,
    Embedding)


class DecoderLayer(nn.Module):
    def __init__(
            self,
            dk,
            dv,
            h,
            dim_multiplier=4,
            pdropout=0.1):
        super().__init__()
        # Reference: page 5, section 3.2.2 "Multi-Head Attention"
        dmodel = dk * h
        # Reference: page 5, section 3.3 "Position-wise Feed-Forward Networks"
        dff = dmodel * dim_multiplier
        # Masked multi-head self-attention
        self.masked_attention = MultiHeadAttention(dk, dv, h, pdropout)
        self.masked_attn_norm = nn.LayerNorm(dmodel)
        # Encoder-decoder (cross) attention
        self.attention = MultiHeadAttention(dk, dv, h, pdropout)
        self.attn_norm = nn.LayerNorm(dmodel)
        # Position-wise feed-forward network
        self.ff = PositionwiseFeedForward(dmodel, dff, pdropout=pdropout)
        self.ff_norm = nn.LayerNorm(dmodel)
        self.dropout = nn.Dropout(p=pdropout)

    def forward(self,
                trg: Tensor,
                src: Tensor,
                trg_mask: Tensor,
                src_mask: Tensor):
        """
        Args:
            trg: embedded target sequences (batch_size, trg_seq_length, d_model)
            src: encoder output sequences (batch_size, src_seq_length, d_model)
            trg_mask: mask for the target sequences (batch_size, 1, trg_seq_length, trg_seq_length)
            src_mask: mask for the source sequences (batch_size, 1, 1, src_seq_length)

        Returns:
            trg: sequences after the decoder layer (batch_size, trg_seq_length, d_model)
            attn_probs: encoder-decoder attention softmax scores
                (batch_size, n_heads, trg_seq_length, src_seq_length)
        """
        # Masked multi-head self-attention over the target sequence
        _trg, attn_probs = self.masked_attention(
            query=trg,
            key=trg,
            val=trg,
            mask=trg_mask)
        # Residual connection between the sublayer input and its output, with
        # dropout applied to the sublayer output (page 7, section 5.4
        # "Regularization"), followed by LayerNorm, as in the original paper.
        trg = self.masked_attn_norm(trg + self.dropout(_trg))

        # Encoder-decoder attention:
        #   query = output of the previous decoder sublayer
        #   key and val = output of the encoder
        #   mask = src_mask
        # Reference: page 5, section 3.2.3, point 1
        _trg, attn_probs = self.attention(
            query=trg,
            key=src,
            val=src,
            mask=src_mask)
        trg = self.attn_norm(trg + self.dropout(_trg))

        # Position-wise feed-forward network, again followed by Add & Norm
        _trg = self.ff(trg)
        trg = self.ff_norm(trg + self.dropout(_trg))
        return trg, attn_probs
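

# A minimal sketch (not part of the original model code) of how masks with the
# shapes documented in DecoderLayer.forward could be built: a padding mask for
# the source and a combined padding + causal (no-peek) mask for the target.
# The helper name and `pad_idx` are assumptions for illustration only.
def build_masks_sketch(src_tokens: Tensor, trg_tokens: Tensor, pad_idx: int = 0):
    # Source padding mask: (batch_size, 1, 1, src_seq_length)
    src_mask = (src_tokens != pad_idx).unsqueeze(1).unsqueeze(2)
    # Target padding mask: (batch_size, 1, 1, trg_seq_length)
    trg_pad_mask = (trg_tokens != pad_idx).unsqueeze(1).unsqueeze(2)
    # Causal mask hiding future positions: (trg_seq_length, trg_seq_length)
    trg_len = trg_tokens.size(1)
    causal_mask = torch.tril(torch.ones(
        trg_len, trg_len, dtype=torch.bool, device=trg_tokens.device))
    # Broadcasts to (batch_size, 1, trg_seq_length, trg_seq_length)
    trg_mask = trg_pad_mask & causal_mask
    return src_mask, trg_mask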


class Decoder(nn.Module):
    def __init__(
            self,
            dk,
            dv,
            h,
            num_decoders,
            dim_multiplier=4,
            pdropout=0.1):
        super().__init__()
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(dk,
                         dv,
                         h,
                         dim_multiplier,
                         pdropout) for _ in range(num_decoders)
        ])

    def forward(self, target_inputs, src_inputs, target_mask, src_mask):
        """
        Args:
            target_inputs: embedded target sequences from the Embedding layer
                (batch_size, trg_seq_length, d_model)
            src_inputs: encoder output sequences (batch_size, src_seq_length, d_model)
            target_mask: mask for the target sequences (batch_size, 1, trg_seq_length, trg_seq_length)
            src_mask: mask for the source sequences (batch_size, 1, 1, src_seq_length)
        """
        target_representation = target_inputs

        # Forward pass through the decoder stack
        for layer in self.decoder_layers:
            target_representation, attn_probs = layer(
                target_representation,
                src_inputs,
                target_mask,
                src_mask)

        # Keep the last layer's encoder-decoder attention weights
        # (e.g. for visualization)
        self.attn_probs = attn_probs
        return target_representation
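

# A minimal usage sketch (assumptions: the hyperparameters below are arbitrary,
# and MultiHeadAttention / PositionwiseFeedForward accept the interfaces used
# above). Feeds random embeddings through the decoder stack as a shape check.
if __name__ == "__main__":
    dk = dv = 64
    h = 8
    dmodel = dk * h
    batch_size, src_len, trg_len = 2, 7, 5

    decoder = Decoder(dk, dv, h, num_decoders=6)
    src = torch.rand(batch_size, src_len, dmodel)   # stand-in encoder output
    trg = torch.rand(batch_size, trg_len, dmodel)   # stand-in target embeddings
    trg_mask = torch.tril(torch.ones(trg_len, trg_len, dtype=torch.bool)).expand(
        batch_size, 1, trg_len, trg_len)
    src_mask = torch.ones(batch_size, 1, 1, src_len, dtype=torch.bool)

    out = decoder(trg, src, trg_mask, src_mask)
    print(out.shape)  # expected: (batch_size, trg_seq_length, dmodel)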