|
import tensorflow as tf |
|
from featurizers.speech_featurizers import SpeechFeaturizer |
|
from .layers.attention import Attention |
|
|
|
|
|
# Shared L2 kernel regularizer for this module. None of the definitions
# visible in this part of the file reference it.
_L2_FACTOR = 1e-6
L2 = tf.keras.regularizers.l2(_L2_FACTOR)
|
|
|
|
|
def shape_list(x, out_type=tf.int32):
    """Return the shape of ``x`` as a list with one entry per axis.

    Axes whose static size is known are reported as plain Python ints. Axes
    whose static size is ``None`` are reported as the matching element of
    ``tf.shape(x, out_type=out_type)`` instead.

    Args:
        x: Tensor to inspect; ``x.shape.as_list()`` must be available.
        out_type: Integer dtype forwarded to ``tf.shape``.

    Returns:
        A list mixing Python ints (static axes) and scalar tensors (axes
        only known at run time).
    """
    known_dims = x.shape.as_list()
    runtime_dims = tf.shape(x, out_type=out_type)
    return [
        size if size is not None else runtime_dims[axis]
        for axis, size in enumerate(known_dims)
    ]
|
|
|
|
|
def merge_two_last_dims(x):
    """Collapse the last two axes of a 4-D tensor into a single axis.

    The shape of ``x`` is unpacked into exactly four entries, so ``x`` must
    be rank 4. The first axis is kept, the second axis is left for
    ``tf.reshape`` to infer (``-1``), and the last two axes are flattened
    into one axis whose size is their product.

    Args:
        x: Rank-4 tensor.

    Returns:
        A rank-3 tensor holding the same elements as ``x``.
    """
    batch, _, feat, chan = shape_list(x)
    flattened = feat * chan
    return tf.reshape(x, shape=[batch, -1, flattened])
|
|
|
|
|
class MulSpeechLR(tf.keras.Model):
    """Keras model: conv stack -> GRU -> attention -> dense output head.

    ``call`` takes ``[x, x_len]`` and runs ``x`` through:

    1. a trailing channel axis added with ``tf.expand_dims``;
    2. conv1 -> layer norm -> max-pool -> conv2 -> layer norm -> conv3 ->
       layer norm;
    3. ``merge_two_last_dims`` to flatten the last two axes;
    4. a GRU that returns both the full sequence and its final state;
    5. the project ``Attention`` layer, whose output is added to the GRU's
       final state;
    6. a dense layer with ReLU, then a dense layer with ``vocab_size`` units.
    """

    def __init__(self, name, filters, kernel_size, d_model, rnn_cell, seq_mask, vocab_size, dropout=0.5):
        """Create all sub-layers.

        Args:
            name: Prefix for the names of the layer-norm, dense, GRU and add
                sub-layers. It is not forwarded to the Keras base class, so
                the model itself keeps its auto-generated name.
            filters: Indexable; entries 0, 1 and 2 are the filter counts of
                the three convolutions.
            kernel_size: Indexable; entries 0, 1 and 2 are the kernel sizes
                of the three convolutions.
            d_model: Units of the dense layer applied before the output layer.
            rnn_cell: Units of the GRU; also passed to ``Attention``.
            seq_mask: Stored as ``self.mask`` and handed to the attention
                layer on every call.
            vocab_size: Units of the final dense layer.
            dropout: Accepted for interface compatibility; nothing in this
                class reads it.
        """
        super().__init__()

        self.filters1 = filters[0]
        self.filters2 = filters[1]
        self.filters3 = filters[2]
        self.kernel_size1 = kernel_size[0]
        self.kernel_size2 = kernel_size[1]
        self.kernel_size3 = kernel_size[2]

        self.mask = seq_mask

        # NOTE: layers are created in the original order on purpose; the
        # convolutions, pooling and output dense layers are unnamed, so
        # their auto-generated names depend on creation order.
        self.conv1 = tf.keras.layers.Conv2D(
            filters=self.filters1,
            kernel_size=self.kernel_size1,
            strides=(2, 2),
            padding='same',
            activation='relu',
        )
        self.maxpool1 = tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2))

        self.conv2 = tf.keras.layers.Conv2D(
            filters=self.filters2,
            kernel_size=self.kernel_size2,
            strides=(2, 2),
            padding='same',
            activation='relu',
        )
        self.conv3 = tf.keras.layers.Conv2D(
            filters=self.filters3,
            kernel_size=self.kernel_size3,
            strides=(1, 1),
            padding='same',
            activation='relu',
        )
        self.ln1 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_1")
        self.ln2 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_2")
        self.ln3 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_3")

        self.linear2 = tf.keras.layers.Dense(d_model, name=f"{name}_dense_2")
        self.rnn = tf.keras.layers.GRU(
            rnn_cell,
            return_sequences=True,
            return_state=True,
            name=f"{name}_gru",
        )
        self.attention = Attention(rnn_cell)
        self.class_layer = tf.keras.layers.Dense(vocab_size)
        self.res_add = tf.keras.layers.Add(name=f"{name}_add")

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Pair ``(x, x_len)``. ``x`` gets a trailing axis appended
                before the 2-D convolutions, so it is presumably shaped
                (batch, time, feature) -- TODO confirm against the caller.
                ``x_len`` is forwarded unchanged to the attention layer.

        Returns:
            The output of the final dense layer; no softmax is applied here.
        """
        x, x_len = inputs

        # Conv2D needs a channel axis, so append one of size 1.
        x = tf.expand_dims(x, axis=-1)
        x = self.ln1(self.conv1(x))
        x = self.maxpool1(x)
        x = self.ln2(self.conv2(x))
        x = self.ln3(self.conv3(x))

        # Flatten the last two axes so the GRU receives a rank-3 tensor.
        x = merge_two_last_dims(x)
        x, final_state = self.rnn(x)
        x = self.attention(x, x_len, self.mask)

        # NOTE(review): Add requires matching shapes, so the attention output
        # is assumed to have the same shape as the GRU final state -- confirm
        # against the Attention layer.
        x = self.res_add([x, final_state])

        hidden = tf.nn.relu(self.linear2(x))
        return self.class_layer(hidden)

    def init_build(self, input_shape):
        """Build the model by calling it once on symbolic Keras inputs.

        Args:
            input_shape: Shape passed to ``tf.keras.Input`` for the float32
                feature input. The length input is an int32 ``Input`` with
                ``shape=[]``.
        """
        feature_input = tf.keras.Input(shape=input_shape, dtype=tf.float32)
        length_input = tf.keras.Input(shape=[], dtype=tf.int32)
        self([feature_input, length_input], training=False)

    def add_featurizers(self,
                        speech_featurizer: SpeechFeaturizer):
        """
        Function to add featurizer to model to convert to end2end tflite

        The featurizer is stored on the model and used by ``predict_pb``, so
        this must be called before ``predict_pb``.

        Args:
            speech_featurizer: SpeechFeaturizer instance
        """
        self.speech_featurizer = speech_featurizer

    @tf.function(input_signature=[tf.TensorSpec([None], dtype=tf.float32)])
    def predict_pb(self, signal):
        """Run the model on one raw signal and return its top-scoring output.

        Args:
            signal: 1-D float32 tensor of any length (fixed by the
                ``input_signature``).

        Returns:
            A pair ``(index, value)``: the argmax over the last axis of the
            squeezed softmax output, and ``tf.gather`` of that squeezed
            output at the index. Presumably these are the predicted class id
            and its probability, which holds when the squeezed output is
            1-D -- TODO confirm.
        """
        features = self.speech_featurizer.tf_extract(signal)

        # The model expects batched inputs: wrap the length of the first
        # feature axis in a 1-element tensor and add a batch axis of size 1.
        input_len = tf.expand_dims(tf.shape(features)[0], axis=0)
        batched_features = tf.expand_dims(features, axis=0)

        scores = self([batched_features, input_len], training=False)
        # squeeze() without an axis drops every size-1 axis, not only the
        # batch axis added above.
        probs = tf.squeeze(tf.nn.softmax(scores))
        best = tf.argmax(probs, axis=-1)

        return best, tf.gather(probs, best)