import tensorflow as tf


class Attention(tf.keras.layers.Layer):
    def __init__(self, hidden_size, attention_size=1, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        # Projection of the encoder hidden states into the attention space.
        self.w_kernel = self.add_weight('w_kernel', [hidden_size, attention_size])
        self.w_bias = self.add_weight('w_bias', [attention_size])
        # Learned context vector used to score each timestep (kept under its
        # original name `bias` for compatibility).
        self.bias = self.add_weight('bias', [attention_size])

    def call(self, inputs, inp_len, maxlen=150, mask=None, training=False, **kwargs):
        """
        inp_len: true (unpadded) length of each input audio sequence
        maxlen: sequence length after downsampling (a CNN that downsamples
            twice plus a max-pool, a total factor of 8 = 2 * 2 * 2; see the
            model parameters for details). In our experiments the input
            length is 1200, so the downsampled sequence length is
            1200 // 8 = 150. If you change the input length or the amount of
            downsampling, remember to reset the maxlen parameter!
        """

        # Bidirectional RNNs may return a (forward, backward) tuple of
        # states; concatenate them along the feature axis.
        if isinstance(inputs, tuple):
            inputs = tf.concat(inputs, 2)
        # Score each timestep: project the hidden states, then compare the
        # projection with the learned context vector.
        # v: [batch, time, attention_size], vu: [batch, time]
        v = tf.sigmoid(tf.tensordot(inputs, self.w_kernel, axes=1) + self.w_bias)
        vu = tf.tensordot(v, self.bias, axes=1)
        if mask is not None:
            # Mask padded timesteps *before* the softmax so they receive zero
            # attention weight and the remaining weights still sum to one
            # (masking after the softmax would leave the weights unnormalized).
            seq_mask = tf.sequence_mask(inp_len, maxlen)
            vu = tf.where(seq_mask, vu, tf.fill(tf.shape(vu), float('-inf')))
        alphas = tf.nn.softmax(vu)
        # Weighted sum of the hidden states over time: [batch, hidden_size].
        output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
        return output
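

# Minimal usage sketch (illustrative, not part of the original module): the
# batch size, hidden size, and per-example lengths below are assumed values
# chosen only for demonstration.
if __name__ == '__main__':
    batch, maxlen, hidden = 4, 150, 256
    layer = Attention(hidden_size=hidden, attention_size=1)
    inputs = tf.random.normal([batch, maxlen, hidden])  # padded encoder states
    inp_len = tf.constant([150, 120, 90, 60])           # true lengths per example
    # Any non-None mask value enables length masking via inp_len.
    pooled = layer(inputs, inp_len, maxlen=maxlen, mask=True)
    print(pooled.shape)  # -> (4, 256)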