import tensorflow as tf


class Attention(tf.keras.layers.Layer):
    def __init__(self, hidden_size, attention_size=1, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        # add_weight replaces the deprecated add_variable API.
        self.w_kernel = self.add_weight(name='w_kernel', shape=[hidden_size, attention_size])
        self.w_bias = self.add_weight(name='w_bias', shape=[attention_size])
        self.bias = self.add_weight(name='bias', shape=[attention_size])

    def call(self, inputs, inp_len, maxlen=150, mask=None, training=False, **kwargs):
        """
        inp_len: valid length of each input sequence (number of time steps
            after downsampling).
        maxlen: sequence length after downsampling (a CNN that downsamples
            twice, plus max-pooling). In our experiments the input length is
            1200; after downsampling the sequence length is 1200 // 8 = 150
            (8 = 2 * 2 * 2, see the model parameters for details). If you
            change the input length or the number of downsampling steps,
            remember to reset the maxlen parameter!
        """
        # In case of a Bi-RNN, concatenate the forward and backward RNN outputs.
        if isinstance(inputs, tuple):
            inputs = tf.concat(inputs, 2)

        # Additive attention: project each time step to attention_size, then
        # score it against a learned context vector.
        v = tf.sigmoid(tf.tensordot(inputs, self.w_kernel, axes=1) + self.w_bias)  # (B, T, A)
        vu = tf.tensordot(v, self.bias, axes=1)  # (B, T)

        # Mask padded positions before the softmax so the attention weights are
        # normalized over the valid time steps only (zeroing weights after the
        # softmax would leave weights that no longer sum to 1).
        if mask is None and inp_len is not None:
            mask = tf.sequence_mask(inp_len, maxlen)
        if mask is not None:
            vu = tf.where(mask, vu, tf.fill(tf.shape(vu), -1e9))
        alphas = tf.nn.softmax(vu)  # (B, T)

        # Weighted sum of the inputs over the time axis.
        output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
        return output
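

# Minimal usage sketch (an assumption, not part of the original file): feed a
# random batch of BiRNN-style features through the layer. The shapes below
# (batch of 4, maxlen of 150, hidden size of 256) are illustrative
# placeholders, not values taken from the original experiments.
if __name__ == '__main__':
    batch, maxlen, hidden = 4, 150, 256
    layer = Attention(hidden_size=hidden)
    inputs = tf.random.normal([batch, maxlen, hidden])
    # Valid (unpadded) length of each sequence in the batch.
    inp_len = tf.constant([150, 120, 90, 30])
    pooled = layer(inputs, inp_len=inp_len, maxlen=maxlen)
    print(pooled.shape)  # (4, 256): one pooled vector per sequence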