first commit
history blame
1.5 kB
import tensorflow as tf
class Attention(tf.keras.layers.Layer):
    """Attention pooling that collapses the time axis of a sequence.

    Each time step is scored through a sigmoid projection followed by a dot
    product with a learned vector; the scores are softmax-normalised over
    time and used to form a weighted sum of the inputs.
    """

    def __init__(self, hidden_size, attention_size, **kwargs):
        """Create the layer and its three trainable weights.

        Args:
            hidden_size: Size of the last axis of the tensor passed to
                `call` (after concatenation, when a tuple is passed).
            attention_size: Width of the intermediate attention projection.
            **kwargs: Forwarded unchanged to `tf.keras.layers.Layer`.
        """
        super().__init__(**kwargs)
        # `add_weight` is the supported spelling of the deprecated
        # `add_variable` alias. Keyword arguments are used because the
        # positional order of `add_weight` is not the same in every Keras
        # release. Variable names are unchanged so checkpoints still match.
        self.w_kernel = self.add_weight(
            name='w_kernel', shape=[hidden_size, attention_size])
        self.w_bias = self.add_weight(name='w_bias', shape=[attention_size])
        # Despite its name, this vector is what every projected time step is
        # dotted with to obtain its scalar attention score.
        self.bias = self.add_weight(name='bias', shape=[attention_size])

    def call(self, inputs, inp_len, maxlen=150, mask=None, training=False, **kwargs):
        """Return the attention-weighted sum of `inputs` over the time axis.

        Args:
            inputs: Tensor of shape (B, T, hidden_size), or a tuple of such
                tensors (e.g. forward/backward RNN outputs) that is
                concatenated along the last axis.
            inp_len: Length of each input audio sequence, used to build the
                padding mask.
            maxlen: Audio length after downsampling (CNN, downsampled twice,
                plus max-pool). In our experiments the input length is 1200;
                after downsampling by 8 (= 2*2*2, see the model parameters
                for details) the sequence length is 1200 // 8 = 150.
                If you change the input length or the number of downsampling
                steps, reset this parameter accordingly: it has to equal T.
            mask: Only tested against None. Any other value switches the
                length-based masking on; the value itself is never read.
            training: Unused; accepted for Keras API compatibility.
            **kwargs: Unused.

        Returns:
            Tensor of shape (B, hidden_size).
        """
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        if isinstance(inputs, tuple):
            inputs = tf.concat(inputs, 2)

        # (B, T, hidden_size) -> (B, T, attention_size)
        v = tf.sigmoid(tf.tensordot(inputs, self.w_kernel, axes=1) + self.w_bias)
        # (B, T, attention_size) -> (B, T): one score per time step.
        vu = tf.tensordot(v, self.bias, axes=1)
        alphas = tf.nn.softmax(vu)  # (B, T)

        if mask is not None:
            # Build the mask in the dtype of `alphas` so the product also
            # works when the layer does not run in float32.
            # NOTE(review): padded steps are zeroed after the softmax and the
            # weights are not renormalised, so they sum to less than 1 for
            # padded sequences. Kept as is to stay compatible with models
            # trained with this layer.
            alphas = alphas * tf.sequence_mask(inp_len, maxlen, dtype=alphas.dtype)

        # Weight every time step and sum over time: (B, T, H) -> (B, H).
        output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
        return output