{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.12.0-rc1\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"\n",
"import tensorflow as tf\n",
"\n",
"print(tf.__version__)\n",
"\n",
"# Install the local TensorFlowTTS checkout with %pip so the package lands in the\n",
"# environment of the running kernel (a bare `pip` on PATH may belong to another\n",
"# interpreter) and so the working directory never has to be changed and restored.\n",
"%pip install ./TensorFlowTTS\n",
"\n",
"# Also put the checkout directory on the module search path.\n",
"sys.path.append(\"TensorFlowTTS/\")\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: h5py in c:\\users\\sathishreddy\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (3.8.0)\n",
"Requirement already satisfied: numpy>=1.14.5 in c:\\users\\sathishreddy\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from h5py) (1.23.5)\n"
]
}
],
"source": [
"%pip install h5py"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download pretrained weights: MelGAN + STFT Loss"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading MelGAN-STFT model...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\sathishreddy\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\gdown\\cli.py:126: FutureWarning: Option `--id` was deprecated in version 4.3.1 and will be removed in 5.0. You don't need to pass it anymore to use a file ID.\n",
" warnings.warn(\n",
"Downloading...\n",
"From: https://drive.google.com/uc?id=1WB5iQbk9qB-Y-wO8BU6S2TnRiu4VU5ys\n",
"To: c:\\Users\\sathishreddy\\Desktop\\SpeechProcessing_EndSem_Proj\\melgan.stft-2M.h5\n",
"\n",
" 0%| | 0.00/17.1M [00:00, ?B/s]\n",
" 3%|▎ | 524k/17.1M [00:01<00:47, 350kB/s]\n",
" 6%|▌ | 1.05M/17.1M [00:02<00:30, 535kB/s]\n",
" 9%|▉ | 1.57M/17.1M [00:03<00:28, 544kB/s]\n",
" 12%|█▏ | 2.10M/17.1M [00:03<00:23, 637kB/s]\n",
" 15%|█▌ | 2.62M/17.1M [00:04<00:22, 650kB/s]\n",
" 18%|█▊ | 3.15M/17.1M [00:05<00:21, 650kB/s]\n",
" 21%|██▏ | 3.67M/17.1M [00:06<00:25, 524kB/s]\n",
" 24%|██▍ | 4.19M/17.1M [00:07<00:26, 493kB/s]\n",
" 28%|██▊ | 4.72M/17.1M [00:08<00:24, 510kB/s]\n",
" 31%|███ | 5.24M/17.1M [00:09<00:21, 556kB/s]\n",
" 34%|███▎ | 5.77M/17.1M [00:10<00:18, 611kB/s]\n",
" 37%|███▋ | 6.29M/17.1M [00:11<00:17, 612kB/s]\n",
" 40%|███▉ | 6.82M/17.1M [00:11<00:15, 651kB/s]\n",
" 43%|████▎ | 7.34M/17.1M [00:12<00:15, 637kB/s]\n",
" 46%|████▌ | 7.86M/17.1M [00:13<00:14, 629kB/s]\n",
" 49%|████▉ | 8.39M/17.1M [00:14<00:14, 621kB/s]\n",
" 52%|█████▏ | 8.91M/17.1M [00:15<00:13, 602kB/s]\n",
" 55%|█████▌ | 9.44M/17.1M [00:16<00:13, 587kB/s]\n",
" 58%|█████▊ | 9.96M/17.1M [00:17<00:14, 508kB/s]\n",
" 61%|██████ | 10.5M/17.1M [00:18<00:13, 488kB/s]\n",
" 64%|██████▍ | 11.0M/17.1M [00:19<00:11, 539kB/s]\n",
" 67%|██████▋ | 11.5M/17.1M [00:20<00:10, 557kB/s]\n",
" 70%|███████ | 12.1M/17.1M [00:21<00:08, 588kB/s]\n",
" 73%|███████▎ | 12.6M/17.1M [00:22<00:07, 576kB/s]\n",
" 76%|███████▋ | 13.1M/17.1M [00:23<00:07, 507kB/s]\n",
" 80%|███████▉ | 13.6M/17.1M [00:24<00:07, 501kB/s]\n",
" 83%|████████▎ | 14.2M/17.1M [00:25<00:05, 506kB/s]\n",
" 86%|████████▌ | 14.7M/17.1M [00:27<00:06, 393kB/s]\n",
" 89%|████████▊ | 15.2M/17.1M [00:29<00:05, 362kB/s]\n",
" 92%|█████████▏| 15.7M/17.1M [00:30<00:03, 381kB/s]\n",
" 95%|█████████▍| 16.3M/17.1M [00:31<00:02, 369kB/s]\n",
" 98%|█████████▊| 16.8M/17.1M [00:33<00:00, 389kB/s]\n",
"100%|██████████| 17.1M/17.1M [00:34<00:00, 356kB/s]\n",
"100%|██████████| 17.1M/17.1M [00:34<00:00, 497kB/s]\n",
"c:\\Users\\sathishreddy\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\gdown\\cli.py:126: FutureWarning: Option `--id` was deprecated in version 4.3.1 and will be removed in 5.0. You don't need to pass it anymore to use a file ID.\n",
" warnings.warn(\n",
"Downloading...\n",
"From: https://drive.google.com/uc?id=1OqdrcHJvtXwNasEZP7KXZwtGUDXMKNkg\n",
"To: c:\\Users\\sathishreddy\\Desktop\\SpeechProcessing_EndSem_Proj\\melgan.stft_config.yml\n",
"\n",
" 0%| | 0.00/1.77k [00:00, ?B/s]\n",
"100%|██████████| 1.77k/1.77k [00:00, ?B/s]\n"
]
}
],
"source": [
"print(\"Downloading MelGAN-STFT model...\")\n",
"# The Drive file ID is passed positionally: gdown deprecated `--id` in 4.3.1 and\n",
"# removes it in 5.0 (see the FutureWarning this cell used to emit).\n",
"!gdown 1WB5iQbk9qB-Y-wO8BU6S2TnRiu4VU5ys -O melgan.stft-2M.h5\n",
"!gdown 1OqdrcHJvtXwNasEZP7KXZwtGUDXMKNkg -O melgan.stft_config.yml"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Model"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\sathishreddy\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tensorflow_addons\\utils\\tfa_eol_msg.py:23: UserWarning: \n",
"\n",
"TensorFlow Addons (TFA) has ended development and introduction of new features.\n",
"TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.\n",
"Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). \n",
"\n",
"For more information see: https://github.com/tensorflow/addons/issues/2807 \n",
"\n",
" warnings.warn(\n",
"c:\\Users\\sathishreddy\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"\n",
"import yaml\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import IPython.display as ipd\n",
"\n",
"from tensorflow_tts.inference import TFAutoModel\n",
"from tensorflow_tts.inference import AutoConfig\n",
"from tensorflow_tts.inference import AutoProcessor"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (a) Tacotron 2"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\sathishreddy\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\huggingface_hub\\file_download.py:649: FutureWarning: 'cached_download' is the legacy way to download files from the HF hub, please consider upgrading to 'hf_hub_download'\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n",
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n",
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n",
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n",
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n",
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n",
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n",
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n",
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n",
"WARNING:tensorflow:`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is deprecated and will be removed in a future release. Please use `tf.keras.layers.BatchNormalization` with parameter `synchronized` set to True.\n"
]
}
],
"source": [
"# Pretrained Tacotron 2 checkpoint; passed to do_synthesis as the text2mel model.\n",
"tacotron2 = TFAutoModel.from_pretrained(\n",
"    \"tensorspeech/tts-tacotron2-ljspeech-en\",\n",
"    name=\"tacotron2\",\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (b) FastSpeech"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Pretrained FastSpeech checkpoint.\n",
"fastspeech = TFAutoModel.from_pretrained(\n",
"    \"tensorspeech/tts-fastspeech-ljspeech-en\",\n",
"    name=\"fastspeech\",\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (c) FastSpeech2"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Pretrained FastSpeech2 checkpoint.\n",
"fastspeech2 = TFAutoModel.from_pretrained(\n",
"    \"tensorspeech/tts-fastspeech2-ljspeech-en\",\n",
"    name=\"fastspeech2\",\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (d) MelGAN Original"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Pretrained MelGAN checkpoint; passed to do_synthesis as the vocoder model.\n",
"melgan = TFAutoModel.from_pretrained(\n",
"    \"tensorspeech/tts-melgan-ljspeech-en\",\n",
"    name=\"melgan\",\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (e) MelGAN STFT"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# MelGAN-STFT vocoder: configuration comes from the YAML file inside the\n",
"# TensorFlowTTS checkout, weights from the local `melgan.stft-2M.h5` file.\n",
"melgan_stft_config = AutoConfig.from_pretrained(\n",
"    \"TensorFlowTTS/examples/melgan_stft/conf/melgan_stft.v1.yaml\"\n",
")\n",
"melgan_stft = TFAutoModel.from_pretrained(\n",
"    pretrained_path=\"melgan.stft-2M.h5\",\n",
"    config=melgan_stft_config,\n",
"    name=\"melgan_stft\",\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (f) Multi-band MelGAN"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Pretrained Multi-band MelGAN checkpoint; passed to do_synthesis as the vocoder model.\n",
"mb_melgan = TFAutoModel.from_pretrained(\n",
"    \"tensorspeech/tts-mb_melgan-ljspeech-en\",\n",
"    name=\"mb_melgan\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Text processor; do_synthesis calls its text_to_sequence() to turn text into input IDs.\n",
"processor = AutoProcessor.from_pretrained(\n",
"    \"tensorspeech/tts-tacotron2-ljspeech-en\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def do_synthesis(input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name):\n",
"    \"\"\"Synthesize audio for ``input_text`` with a text2mel model followed by a vocoder.\n",
"\n",
"    Args:\n",
"        input_text: Text to synthesize. It is converted to input IDs with the\n",
"            notebook-level ``processor``.\n",
"        text2mel_model: Model whose ``inference(...)`` method is called with the\n",
"            argument layout selected by ``text2mel_name``.\n",
"        vocoder_model: Callable applied to the predicted mel outputs.\n",
"        text2mel_name: One of \"TACOTRON\", \"FASTSPEECH\", \"FASTSPEECH2\".\n",
"        vocoder_name: One of \"MELGAN\", \"MELGAN-STFT\", \"MB-MELGAN\".\n",
"\n",
"    Returns:\n",
"        For \"TACOTRON\": ``(mel_outputs, alignment_history, audio)``; otherwise\n",
"        ``(mel_outputs, audio)``. Every element is the ``.numpy()`` conversion of\n",
"        the corresponding tensor.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``text2mel_name`` or ``vocoder_name`` is not supported.\n",
"    \"\"\"\n",
"    # Validate both names up front so a typo in vocoder_name fails immediately\n",
"    # instead of after the text2mel inference has already been run.\n",
"    if text2mel_name not in (\"TACOTRON\", \"FASTSPEECH\", \"FASTSPEECH2\"):\n",
"        raise ValueError(\"Only TACOTRON, FASTSPEECH, FASTSPEECH2 are supported on text2mel_name\")\n",
"    if vocoder_name not in (\"MELGAN\", \"MELGAN-STFT\", \"MB-MELGAN\"):\n",
"        raise ValueError(\"Only MELGAN, MELGAN-STFT and MB-MELGAN are supported on vocoder_name\")\n",
"\n",
"    input_ids = processor.text_to_sequence(input_text)\n",
"\n",
"    # Tensors shared by several branches below.\n",
"    batched_ids = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)\n",
"    speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)\n",
"    unit_ratio = tf.convert_to_tensor([1.0], dtype=tf.float32)\n",
"\n",
"    # text2mel part\n",
"    alignment_history = None\n",
"    if text2mel_name == \"TACOTRON\":\n",
"        _, mel_outputs, _, alignment_history = text2mel_model.inference(\n",
"            batched_ids,\n",
"            tf.convert_to_tensor([len(input_ids)], tf.int32),\n",
"            speaker_ids,\n",
"        )\n",
"    elif text2mel_name == \"FASTSPEECH\":\n",
"        _, mel_outputs, _ = text2mel_model.inference(\n",
"            input_ids=batched_ids,\n",
"            speaker_ids=speaker_ids,\n",
"            speed_ratios=unit_ratio,\n",
"        )\n",
"    else:  # \"FASTSPEECH2\" (the only remaining name after validation)\n",
"        _, mel_outputs, _, _, _ = text2mel_model.inference(\n",
"            batched_ids,\n",
"            speaker_ids=speaker_ids,\n",
"            speed_ratios=unit_ratio,\n",
"            f0_ratios=unit_ratio,\n",
"            energy_ratios=unit_ratio,\n",
"        )\n",
"\n",
"    # vocoder part: every supported vocoder is invoked the same way. Index 0 is\n",
"    # taken on the first and last axes (presumably batch and channel - confirm).\n",
"    audio = vocoder_model(mel_outputs)[0, :, 0]\n",
"\n",
"    if alignment_history is not None:\n",
"        return mel_outputs.numpy(), alignment_history.numpy(), audio.numpy()\n",
"    return mel_outputs.numpy(), audio.numpy()\n",
"\n",
"def visualize_attention(alignment_history):\n",
"    \"\"\"Show an attention alignment as a heat map with a colour bar.\n",
"\n",
"    Args:\n",
"        alignment_history: Array-like handed to ``Axes.imshow``. The vertical axis\n",
"            is labelled \"Encoder timestep\", the horizontal \"Decoder timestep\".\n",
"    \"\"\"\n",
"    # Uses the notebook-level `plt` import, like visualize_mel_spectrogram does.\n",
"    fig, ax = plt.subplots(figsize=(8, 6))\n",
"    ax.set_title('Alignment steps')\n",
"    im = ax.imshow(\n",
"        alignment_history,\n",
"        aspect='auto',\n",
"        origin='lower',\n",
"        interpolation='none',\n",
"    )\n",
"    fig.colorbar(im, ax=ax)\n",
"    ax.set_xlabel('Decoder timestep')\n",
"    ax.set_ylabel('Encoder timestep')\n",
"    fig.tight_layout()\n",
"    plt.show()\n",
"    # Close this specific figure rather than whichever one pyplot considers current.\n",
"    plt.close(fig)\n",
"\n",
"def visualize_mel_spectrogram(mels, num_mels=80):\n",
"    \"\"\"Show a predicted mel spectrogram as an image with a horizontal colour bar.\n",
"\n",
"    Args:\n",
"        mels: Tensor-like that ``tf.reshape`` can reshape to ``[-1, num_mels]``.\n",
"        num_mels: Size of the last axis after reshaping. Defaults to 80, the\n",
"            value this notebook previously hard-coded.\n",
"    \"\"\"\n",
"    mels = tf.reshape(mels, [-1, num_mels]).numpy()\n",
"    fig = plt.figure(figsize=(10, 8))\n",
"    # Slot 1 of a 3x1 grid: the plot occupies only the top third of the figure\n",
"    # (layout kept as in the original).\n",
"    ax1 = fig.add_subplot(311)\n",
"    ax1.set_title('Predicted Mel-after-Spectrogram')\n",
"    # np.rot90 rotates the (frames, num_mels) array by 90 degrees for display.\n",
"    im = ax1.imshow(np.rot90(mels), aspect='auto', interpolation='none')\n",
"    fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)\n",
"    plt.show()\n",
"    # Close this specific figure rather than whichever one pyplot considers current.\n",
"    plt.close(fig)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"input_text = \"Kashmir is India's paradise.\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (a) Tacotron2 + MELGAN"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mels, alignment_history, audios = do_synthesis(\n",
"    input_text,\n",
"    text2mel_model=tacotron2,\n",
"    vocoder_model=melgan,\n",
"    text2mel_name=\"TACOTRON\",\n",
"    vocoder_name=\"MELGAN\",\n",
")\n",
"# Optional diagnostics: uncomment to plot the attention alignment / mel spectrogram.\n",
"#visualize_attention(alignment_history[0])\n",
"#visualize_mel_spectrogram(mels[0])\n",
"ipd.Audio(audios, rate=22050)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (b) Tacotron2 + MELGAN-STFT"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mels, alignment_history, audios = do_synthesis(\n",
"    input_text,\n",
"    text2mel_model=tacotron2,\n",
"    vocoder_model=melgan_stft,\n",
"    text2mel_name=\"TACOTRON\",\n",
"    vocoder_name=\"MELGAN-STFT\",\n",
")\n",
"# Optional diagnostics: uncomment to plot the attention alignment / mel spectrogram.\n",
"#visualize_attention(alignment_history[0])\n",
"#visualize_mel_spectrogram(mels[0])\n",
"ipd.Audio(audios, rate=22050)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (c) Tacotron2 + MB-MELGAN"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mels, alignment_history, audios = do_synthesis(\n",
"    input_text,\n",
"    text2mel_model=tacotron2,\n",
"    vocoder_model=mb_melgan,\n",
"    text2mel_name=\"TACOTRON\",\n",
"    vocoder_name=\"MB-MELGAN\",\n",
")\n",
"# Optional diagnostics: uncomment to plot the attention alignment / mel spectrogram.\n",
"#visualize_attention(alignment_history[0])\n",
"#visualize_mel_spectrogram(mels[0])\n",
"ipd.Audio(audios, rate=22050)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### (d) FastSpeech + MB-MELGAN"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"