import tensorflow as tf


class Attention(tf.keras.layers.Layer):
    def __init__(self, hidden_size, attention_size=1, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        # Projection of the encoder hidden states into the attention space.
        self.w_kernel = self.add_weight('w_kernel', [hidden_size, attention_size])
        self.w_bias = self.add_weight('w_bias', [attention_size])
        # Learned context vector used to score each timestep (kept under its
        # original name `bias` for compatibility).
        self.bias = self.add_weight('bias', [attention_size])

    def call(self, inputs, inp_len, maxlen=150, mask=None, training=False, **kwargs):
        """
        inp_len: true (unpadded) length of each input audio sequence
        maxlen: sequence length after downsampling (a CNN that downsamples
            twice plus a max-pool, a total factor of 8 = 2 * 2 * 2; see the
            model parameters for details). In our experiments the input
            length is 1200, so the downsampled sequence length is
            1200 // 8 = 150. If you change the input length or the amount of
            downsampling, remember to reset the maxlen parameter!
        """

        # Bidirectional RNNs may return a (forward, backward) tuple of
        # states; concatenate them along the feature axis.
        if isinstance(inputs, tuple):
            inputs = tf.concat(inputs, 2)
        # Score each timestep: project the hidden states, then compare the
        # projection with the learned context vector.
        # v: [batch, time, attention_size], vu: [batch, time]
        v = tf.sigmoid(tf.tensordot(inputs, self.w_kernel, axes=1) + self.w_bias)
        vu = tf.tensordot(v, self.bias, axes=1)
        if mask is not None:
            # Mask padded timesteps *before* the softmax so they receive zero
            # attention weight and the remaining weights still sum to one
            # (masking after the softmax would leave the weights unnormalized).
            seq_mask = tf.sequence_mask(inp_len, maxlen)
            vu = tf.where(seq_mask, vu, tf.fill(tf.shape(vu), float('-inf')))
        alphas = tf.nn.softmax(vu)
        # Weighted sum of the hidden states over time: [batch, hidden_size].
        output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
        return output
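

# Minimal usage sketch (illustrative, not part of the original module): the
# batch size, hidden size, and per-example lengths below are assumed values
# chosen only for demonstration.
if __name__ == '__main__':
    batch, maxlen, hidden = 4, 150, 256
    layer = Attention(hidden_size=hidden, attention_size=1)
    inputs = tf.random.normal([batch, maxlen, hidden])  # padded encoder states
    inp_len = tf.constant([150, 120, 90, 60])           # true lengths per example
    # Any non-None mask value enables length masking via inp_len.
    pooled = layer(inputs, inp_len, maxlen=maxlen, mask=True)
    print(pooled.shape)  # -> (4, 256)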