{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Setup\n",
        "\n",
        "Install the Python dependencies, clone the code and checkpoint repositories, and gather the encoder, synthesizer and HiFiGAN vocoder files under `saved_models/final_models`. All paths assume the Colab working directory `/content`."
      ],
      "metadata": {
        "id": "5Eo0Ell3_W2y"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%pip install scipy==1.12.0\n",
        "%pip install parallel-wavegan"
      ],
      "metadata": {
        "id": "qiqpB5skFEsC"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "uAyYBAMA-Z_m",
        "outputId": "a9a14f61-32eb-43b7-b974-8e3d10167d31"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'Persian-MultiSpeaker-Tacotron2'...\n",
            "remote: Enumerating objects: 538, done.\u001b[K\n",
            "remote: Counting objects: 100% (65/65), done.\u001b[K\n",
            "remote: Compressing objects: 100% (28/28), done.\u001b[K\n",
            "remote: Total 538 (delta 50), reused 44 (delta 37), pack-reused 473 (from 1)\u001b[K\n",
            "Receiving objects: 100% (538/538), 81.71 MiB | 30.56 MiB/s, done.\n",
            "Resolving deltas: 100% (168/168), done.\n",
            "Cloning into 'Persian-Tacotron2-on-ManaTTS'...\n",
            "remote: Enumerating objects: 71, done.\u001b[K\n",
            "remote: Counting objects: 100% (67/67), done.\u001b[K\n",
            "remote: Compressing objects: 100% (66/66), done.\u001b[K\n",
            "remote: Total 71 (delta 16), reused 0 (delta 0), pack-reused 4 (from 1)\u001b[K\n",
            "Unpacking objects: 100% (71/71), 161.58 KiB | 4.62 MiB/s, done.\n"
          ]
        }
      ],
      "source": [
        "# Clone the model code from GitHub and the repository holding synthesizer.pt and sample.wav from Hugging Face.\n",
        "!git clone https://github.com/MahtaFetrat/Persian-MultiSpeaker-Tacotron2.git\n",
        "!git clone https://huggingface.co/MahtaFetrat/Persian-Tacotron2-on-ManaTTS"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Copy the encoder checkpoint from saved_models/default into saved_models/final_models.\n",
        "# -p keeps the cell re-runnable when the directory already exists.\n",
        "!mkdir -p /content/Persian-MultiSpeaker-Tacotron2/saved_models/final_models\n",
        "!cp /content/Persian-MultiSpeaker-Tacotron2/saved_models/default/encoder.pt /content/Persian-MultiSpeaker-Tacotron2/saved_models/final_models/encoder.pt"
      ],
      "metadata": {
        "id": "l3IXShRTAIJN"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Download the pretrained vctk_hifigan.v1 model into the current directory; the call returns the checkpoint path.\n",
        "from parallel_wavegan.utils import download_pretrained_model\n",
        "download_pretrained_model(\"vctk_hifigan.v1\", \".\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 122
        },
        "id": "51XBcgeQFfNc",
        "outputId": "73719bba-acb7-47ab-9f2b-b9fdef78dc0a"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "Downloading...\n",
            "From (original): https://drive.google.com/uc?id=1oVOC4Vf0DYLdDp4r7GChfgj7Xh5xd0ex\n",
            "From (redirected): https://drive.google.com/uc?id=1oVOC4Vf0DYLdDp4r7GChfgj7Xh5xd0ex&confirm=t&uuid=3b842229-0fdd-4c57-bf33-3c8528810490\n",
            "To: /content/vctk_hifigan.v1.tar.gz\n",
            "100%|██████████| 916M/916M [00:12<00:00, 73.7MB/s]\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'./vctk_hifigan.v1/checkpoint-2500000steps.pkl'"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Move the vocoder checkpoint, its config and the synthesizer into final_models, then remove the download leftovers.\n",
        "!mv /content/vctk_hifigan.v1/checkpoint-2500000steps.pkl /content/Persian-MultiSpeaker-Tacotron2/saved_models/final_models/vocoder_HiFiGAN.pkl\n",
        "!mv /content/vctk_hifigan.v1/config.yml /content/Persian-MultiSpeaker-Tacotron2/saved_models/final_models/config.yml\n",
        "!rm -rf /content/vctk_hifigan.v1 /content/vctk_hifigan.v1.tar.gz /content/vctk_hifigan.v1.tar.gz.lock\n",
        "!mv /content/Persian-Tacotron2-on-ManaTTS/synthesizer.pt /content/Persian-MultiSpeaker-Tacotron2/saved_models/final_models/\n",
        "!mkdir -p /content/Persian-MultiSpeaker-Tacotron2/results"
      ],
      "metadata": {
        "id": "bl6InGC8GOmn"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Inference\n",
        "\n",
        "Run `inference.py` on a Persian sentence with a reference recording (`--ref_wav_path`), then play the generated `results/test_output.wav`."
      ],
      "metadata": {
        "id": "MYd6GObY_Zxr"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!cd /content/Persian-MultiSpeaker-Tacotron2 && python3 inference.py --vocoder \"HiFiGAN\" --text \"مدل تولید گفتار با دادگان نسل مانا\" --ref_wav_path \"/content/Persian-Tacotron2-on-ManaTTS/sample.wav\" --test_name \"test_output\""
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "HDBQ-tPK_G9X",
        "outputId": "107a11b6-9d99-481c-8f80-70c157c96a5e"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/Persian-MultiSpeaker-Tacotron2/encoder/audio.py:13: UserWarning: Unable to import 'webrtcvad'. This package enables noise removal and is recommended.\n",
            "  warn(\"Unable to import 'webrtcvad'. This package enables noise removal and is recommended.\")\n",
            "Loaded encoder \"/content/Persian-MultiSpeaker-Tacotron2/saved_models/final_models/encoder.pt\" trained to step 1564501\n",
            "Synthesizer using device: cpu\n",
            "/usr/local/lib/python3.11/dist-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
            "  WeightNorm.apply(module, name, dim) Trainable Parameters: 30.901M   We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
            " checkpoint = torch.load(str(path), map_location=device)\n",
            "Loaded synthesizer \"/content/Persian-MultiSpeaker-Tacotron2/saved_models/final_models/synthesizer.pt\" trained to step 300000\n",
            "\n",
            "| Generating 1/1\n",
            "\n",
            "\n",
            "Done.\n",
            "\n",
            "\n",
            "wav file is saved.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from IPython.display import Audio\n",
        "Audio('/content/Persian-MultiSpeaker-Tacotron2/results/test_output.wav')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 75
        },
        "id": "U_GP3Qzn8kFB",
        "outputId": "ba729327-acec-4a59-e1ac-dea465652b4f"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              ""
            ],
            "text/html": [
              "\n",
              " \n",
              " "
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ]
    }
  ]
}