import tensorflow as tf
from featurizers.speech_featurizers import SpeechFeaturizer
from .layers.attention import Attention
L2 = tf.keras.regularizers.l2(1e-6)


def shape_list(x, out_type=tf.int32):
    """Deal with dynamic shape in tensorflow cleanly."""
    static = x.shape.as_list()
    dynamic = tf.shape(x, out_type=out_type)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]


def merge_two_last_dims(x):
    """Merge the last two dimensions: [B, T, F, C] -> [B, T, F * C]."""
    b, _, f, c = shape_list(x)
    return tf.reshape(x, shape=[b, -1, f * c])
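

# Illustrative sketch of how the two helpers above behave on a feature batch.
# The shapes used here are assumptions for the example only; the real values
# depend on the featurizer and model configuration.
#
#   feats = tf.zeros([4, 150, 20, 32])   # [batch, time, freq, channels]
#   shape_list(feats)                    # -> [4, 150, 20, 32]; for axes that are
#                                        #    None statically, the dynamic value is used
#   merge_two_last_dims(feats).shape     # -> TensorShape([4, 150, 640])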


class MulSpeechLR(tf.keras.Model):
    def __init__(self, name, filters, kernel_size, d_model, rnn_cell, seq_mask, vocab_size, dropout=0.5):
        super(MulSpeechLR, self).__init__()
        self.filters1 = filters[0]
        self.filters2 = filters[1]
        self.filters3 = filters[2]
        self.kernel_size1 = kernel_size[0]
        self.kernel_size2 = kernel_size[1]
        self.kernel_size3 = kernel_size[2]
        # During training, self.mask can be set to True, but during inference it must be False.
        self.mask = seq_mask
        self.conv1 = tf.keras.layers.Conv2D(filters=self.filters1, kernel_size=self.kernel_size1,
                                            strides=(2, 2), padding='same', activation='relu')
        self.maxpool1 = tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2))
        self.conv2 = tf.keras.layers.Conv2D(filters=self.filters2, kernel_size=self.kernel_size2,
                                            strides=(2, 2), padding='same', activation='relu')
        self.conv3 = tf.keras.layers.Conv2D(filters=self.filters3, kernel_size=self.kernel_size3,
                                            strides=(1, 1), padding='same', activation='relu')
        self.ln1 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_1")
        self.ln2 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_2")
        self.ln3 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_3")
        # self.linear1 = tf.keras.layers.Dense(d_model*2, name=f"{name}_dense_1")
        self.linear2 = tf.keras.layers.Dense(d_model, name=f"{name}_dense_2")
        self.rnn = tf.keras.layers.GRU(rnn_cell, return_sequences=True, return_state=True, name=f"{name}_gru")
        self.attention = Attention(rnn_cell)
        self.class_layer = tf.keras.layers.Dense(vocab_size)
        self.res_add = tf.keras.layers.Add(name=f"{name}_add")

    def call(self, inputs):
        x, x_len = inputs
        # mask = tf.cast(tf.sequence_mask(x_len, maxlen=150), dtype=tf.float32)
        # Add a channel axis so the 2D convolutions see [batch, time, freq, 1].
        x = tf.expand_dims(x, axis=-1)
        x = self.conv1(x)
        x = self.ln1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.ln2(x)
        x = self.conv3(x)
        x = self.ln3(x)
        # Collapse the frequency and channel axes into a single feature axis for the GRU.
        x = merge_two_last_dims(x)
        x, final_state = self.rnn(x)
        # Attention over the GRU output sequence (masked by x_len when self.mask is True).
        x = self.attention(x, x_len, self.mask)
        # Residual-style add of the attention output and the final GRU state.
        x = self.res_add([x, final_state])
        output = self.linear2(x)
        output = tf.nn.relu(output)
        output = self.class_layer(output)
        return output

    def init_build(self, input_shape):
        x = tf.keras.Input(shape=input_shape, dtype=tf.float32)
        l = tf.keras.Input(shape=[], dtype=tf.int32)
        self([x, l], training=False)

    def add_featurizers(self, speech_featurizer: SpeechFeaturizer):
        """
        Attach a featurizer to the model so it can be converted to an end-to-end TFLite model.

        Args:
            speech_featurizer: a SpeechFeaturizer instance
        """
        self.speech_featurizer = speech_featurizer

    @tf.function(input_signature=[tf.TensorSpec([None], dtype=tf.float32)])
    def predict_pb(self, signal):
        features = self.speech_featurizer.tf_extract(signal)
        input_len = tf.expand_dims(tf.shape(features)[0], axis=0)
        inputs = tf.expand_dims(features, axis=0)
        output = self([inputs, input_len], training=False)
        output = tf.nn.softmax(output)
        output1 = tf.squeeze(output)
        # Return the predicted class id and its softmax probability.
        output = tf.argmax(output1, axis=-1)
        return output, tf.gather(output1, output)
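

# Minimal usage sketch. The hyperparameters below (filter counts, kernel sizes,
# d_model, GRU units, vocabulary size, and the [time, feature] shape passed to
# init_build) are illustrative assumptions only; real values come from the
# training configuration and the speech featurizer.
if __name__ == "__main__":
    model = MulSpeechLR(
        name="mul_speech_lr",
        filters=[32, 64, 96],
        kernel_size=[3, 3, 3],
        d_model=256,
        rnn_cell=320,
        seq_mask=False,  # keep False for inference (see the note in __init__)
        vocab_size=100,
    )
    # Build the weights by tracing a dummy [time, feature] input.
    model.init_build(input_shape=[150, 80])
    model.summary()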