|
import streamlit as st |
|
import tensorflow as tf |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
from tensorflow_tts.inference import TFAutoModel |
|
from tensorflow_tts.inference import AutoConfig |
|
from tensorflow_tts.inference import AutoProcessor |
|
|
|
# ---------------------------------------------------------------------------
# Page header and user inputs
# ---------------------------------------------------------------------------

# Every text2mel + vocoder pairing offered in the sidebar, in display order.
# The first entry is what the selectbox shows when the page first loads.
_MODEL_OPTIONS = [
    "Tacotron2 + MelGAN",
    "Tacotron2 + MelGAN-STFT",
    "Tacotron2 + MB-MelGAN",
    "FastSpeech + MB-MelGAN",
    "FastSpeech + MelGAN-STFT",
    "FastSpeech + MelGAN",
    "FastSpeech2 + MB-MelGAN",
    "FastSpeech2 + MelGAN-STFT",
    "FastSpeech2 + MelGAN",
]

# Sample sentence pre-filled in the text box.
_DEFAULT_TEXT = (
    "Bill got in the habit of asking himself “Is that thought true?” "
    "And if he wasn’t absolutely certain it was, he just let it go."
)

st.title("Text-to-Speech Synthesis")

# Label of the model pairing chosen in the sidebar.
model_selection = st.sidebar.selectbox(label="Select Model", options=_MODEL_OPTIONS)

# Text that will be synthesized when the button is pressed.
input_text = st.text_area(label="Enter Text", value=_DEFAULT_TEXT)
|
|
|
|
|
@st.cache_resource
def _load_tts_resources():
    """Load every pretrained model, the MelGAN-STFT config and the text processor.

    Streamlit re-executes this script from the top on every widget
    interaction. Without caching, all of the checkpoints below would be
    loaded again on each rerun. ``st.cache_resource`` keeps the loaded
    objects alive and hands the same instances back on later reruns.

    Returns:
        A tuple ``(tacotron2, fastspeech, fastspeech2, melgan,
        melgan_stft_config, melgan_stft, mb_melgan, processor)``.
    """
    tacotron2 = TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en", name="tacotron2")
    fastspeech = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech-ljspeech-en", name="fastspeech")
    fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en", name="fastspeech2")
    melgan = TFAutoModel.from_pretrained("tensorspeech/tts-melgan-ljspeech-en", name="melgan")

    # MelGAN-STFT is built from a YAML config plus a weights file given as
    # relative paths.
    # NOTE(review): presumably both are resolved against the working
    # directory the app is started from - confirm they exist there.
    melgan_stft_config = AutoConfig.from_pretrained('TensorFlowTTS/examples/melgan_stft/conf/melgan_stft.v1.yaml')
    melgan_stft = TFAutoModel.from_pretrained(
        config=melgan_stft_config,
        pretrained_path="melgan.stft-2M.h5",
        name="melgan_stft"
    )

    mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en", name="mb_melgan")

    # A single processor (loaded from the Tacotron2 repository) converts text
    # to ids for all three text2mel models.
    processor = AutoProcessor.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en")

    return (
        tacotron2,
        fastspeech,
        fastspeech2,
        melgan,
        melgan_stft_config,
        melgan_stft,
        mb_melgan,
        processor,
    )


# Module-level names used by the rest of the script.
(
    tacotron2,
    fastspeech,
    fastspeech2,
    melgan,
    melgan_stft_config,
    melgan_stft,
    mb_melgan,
    processor,
) = _load_tts_resources()
|
|
|
def do_synthesis(input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name):
    """Convert text to a mel spectrogram and then to a waveform.

    Args:
        input_text: Text to synthesize; converted to ids with the
            module-level ``processor``.
        text2mel_model: Model whose ``inference`` method produces the mel
            spectrogram.
        vocoder_model: Callable that maps the mel spectrogram to audio.
        text2mel_name: Selects the ``inference`` call signature; one of
            ``"TACOTRON"``, ``"FASTSPEECH"`` or ``"FASTSPEECH2"``.
        vocoder_name: One of ``"MELGAN"``, ``"MELGAN-STFT"`` or
            ``"MB-MELGAN"``.

    Returns:
        A tuple ``(mel_outputs, audio)`` of NumPy arrays: the mel output of
        the text2mel model and the ``[0, :, 0]`` slice of the vocoder output.

    Raises:
        ValueError: If ``text2mel_name`` or ``vocoder_name`` is not one of
            the supported values.
    """
    # Validate both names before any model runs, so a bad vocoder name is not
    # reported only after the expensive text2mel inference has finished.
    if text2mel_name not in ("TACOTRON", "FASTSPEECH", "FASTSPEECH2"):
        raise ValueError("Only TACOTRON, FASTSPEECH, FASTSPEECH2 are supported on text2mel_name")
    if vocoder_name not in ("MELGAN", "MELGAN-STFT", "MB-MELGAN"):
        raise ValueError("Only MELGAN, MELGAN-STFT and MB-MELGAN are supported on vocoder_name")

    input_ids = processor.text_to_sequence(input_text)

    # Tensors shared by all three text2mel variants: the id sequence with a
    # leading batch axis of size 1, and speaker id 0.
    batched_ids = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
    speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)

    # Each model's inference() returns a different number of values; only the
    # second one is used as the mel spectrogram.
    if text2mel_name == "TACOTRON":
        _, mel_outputs, _, _ = text2mel_model.inference(
            batched_ids,
            tf.convert_to_tensor([len(input_ids)], tf.int32),
            speaker_ids,
        )
    elif text2mel_name == "FASTSPEECH":
        _, mel_outputs, _ = text2mel_model.inference(
            input_ids=batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )
    else:  # "FASTSPEECH2" - guaranteed by the validation above
        _, mel_outputs, _, _, _ = text2mel_model.inference(
            batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )

    # All supported vocoders are invoked and sliced the same way.
    audio = vocoder_model(mel_outputs)[0, :, 0]

    return mel_outputs.numpy(), audio.numpy()
|
|
|
# Run synthesis and render the results when the user presses the button.
if st.button("Synthesize"):

    # Sidebar label -> (text2mel model, vocoder model, text2mel_name, vocoder_name),
    # i.e. the arguments do_synthesis needs after the input text.
    pipelines = {
        "Tacotron2 + MelGAN": (tacotron2, melgan, "TACOTRON", "MELGAN"),
        "Tacotron2 + MelGAN-STFT": (tacotron2, melgan_stft, "TACOTRON", "MELGAN-STFT"),
        "Tacotron2 + MB-MelGAN": (tacotron2, mb_melgan, "TACOTRON", "MB-MELGAN"),
        "FastSpeech + MB-MelGAN": (fastspeech, mb_melgan, "FASTSPEECH", "MB-MELGAN"),
        "FastSpeech + MelGAN-STFT": (fastspeech, melgan_stft, "FASTSPEECH", "MELGAN-STFT"),
        "FastSpeech + MelGAN": (fastspeech, melgan, "FASTSPEECH", "MELGAN"),
        "FastSpeech2 + MB-MelGAN": (fastspeech2, mb_melgan, "FASTSPEECH2", "MB-MELGAN"),
        "FastSpeech2 + MelGAN-STFT": (fastspeech2, melgan_stft, "FASTSPEECH2", "MELGAN-STFT"),
        "FastSpeech2 + MelGAN": (fastspeech2, melgan, "FASTSPEECH2", "MELGAN"),
    }
    mel_outputs, audio = do_synthesis(input_text, *pipelines[model_selection])

    # Plot the predicted mel spectrogram. The output is flattened to rows of
    # 80 values and rotated by 90 degrees for display.
    mels = np.reshape(mel_outputs, [-1, 80])
    fig = plt.figure(figsize=(10, 8))
    try:
        ax1 = fig.add_subplot(311)
        ax1.set_title('Predicted Mel-after-Spectrogram')
        im = ax1.imshow(np.rot90(mels), aspect='auto', interpolation='none')
        fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
        st.pyplot(fig)
    finally:
        # pyplot keeps a reference to every figure it creates; close this one
        # so figures do not pile up across button clicks.
        plt.close(fig)

    # Play back the synthesized waveform at 22050 Hz.
    st.audio(audio, format="audio/wav", sample_rate=22050, start_time=0)

# NOTE: there is deliberately no `if __name__ == '__main__': app()` guard.
# No `app` function is defined in this script, so that call raised NameError
# on every run. The module body above is the application.
|
|