{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "aAGQPfgYIR23" }, "source": [ "### Install packages and download models" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zDPW5uSpISd2", "outputId": "6463ff79-18d5-4071-c6ad-01947beeb368" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ ] } ], "source": [ "%%shell\n", "git clone https://github.com/yl4579/StyleTTS2.git\n", "cd StyleTTS2\n", "pip install SoundFile torchaudio munch torch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git\n", "sudo apt-get install espeak-ng\n", "git-lfs clone https://huggingface.co./yl4579/StyleTTS2-LibriTTS\n", "mv StyleTTS2-LibriTTS/Models .\n", "mv StyleTTS2-LibriTTS/reference_audio.zip .\n", "unzip reference_audio.zip\n", "mv reference_audio Demo/reference_audio" ] }, { "cell_type": "markdown", "metadata": { "id": "eJdB_nCOIVIN" }, "source": [ "### Load models" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cha8Tr2uJwN0" }, "outputs": [], "source": [ "import nltk\n", "nltk.download('punkt')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Qoow8Wd8ITtm" }, "outputs": [], "source": [ "%cd StyleTTS2\n", "\n", "import torch\n", "torch.manual_seed(0)\n", "torch.backends.cudnn.benchmark = False\n", "torch.backends.cudnn.deterministic = True\n", "\n", "import random\n", "random.seed(0)\n", "\n", "import numpy as np\n", "np.random.seed(0)\n", "\n", "# load packages\n", "import time\n", "import random\n", "import yaml\n", "from munch import Munch\n", "import numpy as np\n", "import torch\n", "from torch import nn\n", "import torch.nn.functional as F\n", "import torchaudio\n", "import librosa\n", "from nltk.tokenize import word_tokenize\n", "\n", "from models import *\n", "from utils import *\n", "from text_utils import TextCleaner\n", "textclenaer = TextCleaner()\n", "\n", "%matplotlib inline\n", "\n", "to_mel = torchaudio.transforms.MelSpectrogram(\n", " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n", "mean, std = -4, 4\n", "\n", "def length_to_mask(lengths):\n", " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n", " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n", " return mask\n", "\n", "def preprocess(wave):\n", " wave_tensor = torch.from_numpy(wave).float()\n", " mel_tensor = to_mel(wave_tensor)\n", " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n", " return mel_tensor\n", "\n", "def compute_style(path):\n", " wave, sr = librosa.load(path, sr=24000)\n", " audio, index = librosa.effects.trim(wave, top_db=30)\n", " if sr != 24000:\n", " audio = librosa.resample(audio, sr, 24000)\n", " mel_tensor = preprocess(audio).to(device)\n", "\n", " with torch.no_grad():\n", " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n", " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n", "\n", " return torch.cat([ref_s, ref_p], dim=1)\n", "\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", "\n", "# load phonemizer\n", "import phonemizer\n", "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)\n", "\n", "config = yaml.safe_load(open(\"Models/LibriTTS/config.yml\"))\n", 
"\n", "# load pretrained ASR model\n", "ASR_config = config.get('ASR_config', False)\n", "ASR_path = config.get('ASR_path', False)\n", "text_aligner = load_ASR_models(ASR_path, ASR_config)\n", "\n", "# load pretrained F0 model\n", "F0_path = config.get('F0_path', False)\n", "pitch_extractor = load_F0_models(F0_path)\n", "\n", "# load BERT model\n", "from Utils.PLBERT.util import load_plbert\n", "BERT_path = config.get('PLBERT_dir', False)\n", "plbert = load_plbert(BERT_path)\n", "\n", "model_params = recursive_munch(config['model_params'])\n", "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n", "_ = [model[key].eval() for key in model]\n", "_ = [model[key].to(device) for key in model]\n", "\n", "params_whole = torch.load(\"Models/LibriTTS/epochs_2nd_00020.pth\", map_location='cpu')\n", "params = params_whole['net']\n", "\n", "for key in model:\n", " if key in params:\n", " print('%s loaded' % key)\n", " try:\n", " model[key].load_state_dict(params[key])\n", " except:\n", " from collections import OrderedDict\n", " state_dict = params[key]\n", " new_state_dict = OrderedDict()\n", " for k, v in state_dict.items():\n", " name = k[7:] # remove `module.`\n", " new_state_dict[name] = v\n", " # load params\n", " model[key].load_state_dict(new_state_dict, strict=False)\n", "# except:\n", "# _load(params[key], model[key])\n", "_ = [model[key].eval() for key in model]\n", "\n", "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule\n", "\n", "sampler = DiffusionSampler(\n", " model.diffusion.diffusion,\n", " sampler=ADPM2Sampler(),\n", " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n", " clamp=False\n", ")\n", "\n", "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n", " text = text.strip()\n", " ps = global_phonemizer.phonemize([text])\n", " ps = word_tokenize(ps[0])\n", " ps = ' '.join(ps)\n", " tokens = textclenaer(ps)\n", " tokens.insert(0, 0)\n", " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n", "\n", " with torch.no_grad():\n", " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n", " text_mask = length_to_mask(input_lengths).to(device)\n", "\n", " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n", " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n", " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n", "\n", " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n", " embedding=bert_dur,\n", " embedding_scale=embedding_scale,\n", " features=ref_s, # reference from the same speaker as the embedding\n", " num_steps=diffusion_steps).squeeze(1)\n", "\n", "\n", " s = s_pred[:, 128:]\n", " ref = s_pred[:, :128]\n", "\n", " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n", " s = beta * s + (1 - beta) * ref_s[:, 128:]\n", "\n", " d = model.predictor.text_encoder(d_en,\n", " s, input_lengths, text_mask)\n", "\n", " x, _ = model.predictor.lstm(d)\n", " duration = model.predictor.duration_proj(x)\n", "\n", " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", "\n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n", " for i in range(pred_aln_trg.size(0)):\n", " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n", " c_frame += int(pred_dur[i].data)\n", "\n", " # encode prosody\n", " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n", " 
if model_params.decoder.type == \"hifigan\":\n", " asr_new = torch.zeros_like(en)\n", " asr_new[:, :, 0] = en[:, :, 0]\n", " asr_new[:, :, 1:] = en[:, :, 0:-1]\n", " en = asr_new\n", "\n", " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n", "\n", " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n", " if model_params.decoder.type == \"hifigan\":\n", " asr_new = torch.zeros_like(asr)\n", " asr_new[:, :, 0] = asr[:, :, 0]\n", " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n", " asr = asr_new\n", "\n", " out = model.decoder(asr,\n", " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n", "\n", "\n", " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later\n", "\n", "def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):\n", " text = text.strip()\n", " ps = global_phonemizer.phonemize([text])\n", " ps = word_tokenize(ps[0])\n", " ps = ' '.join(ps)\n", " ps = ps.replace('``', '\"')\n", " ps = ps.replace(\"''\", '\"')\n", "\n", " tokens = textclenaer(ps)\n", " tokens.insert(0, 0)\n", " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n", "\n", " with torch.no_grad():\n", " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n", " text_mask = length_to_mask(input_lengths).to(device)\n", "\n", " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n", " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n", " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n", "\n", " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n", " embedding=bert_dur,\n", " embedding_scale=embedding_scale,\n", " features=ref_s, # reference from the same speaker as the embedding\n", " num_steps=diffusion_steps).squeeze(1)\n", "\n", " if s_prev is not None:\n", " # convex combination of previous and current style\n", " s_pred = t * s_prev + (1 - t) * s_pred\n", "\n", " s = s_pred[:, 128:]\n", " ref = s_pred[:, :128]\n", "\n", " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n", " s = beta * s + (1 - beta) * ref_s[:, 128:]\n", "\n", " s_pred = torch.cat([ref, s], dim=-1)\n", "\n", " d = model.predictor.text_encoder(d_en,\n", " s, input_lengths, text_mask)\n", "\n", " x, _ = model.predictor.lstm(d)\n", " duration = model.predictor.duration_proj(x)\n", "\n", " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", "\n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n", " for i in range(pred_aln_trg.size(0)):\n", " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n", " c_frame += int(pred_dur[i].data)\n", "\n", " # encode prosody\n", " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n", " if model_params.decoder.type == \"hifigan\":\n", " asr_new = torch.zeros_like(en)\n", " asr_new[:, :, 0] = en[:, :, 0]\n", " asr_new[:, :, 1:] = en[:, :, 0:-1]\n", " en = asr_new\n", "\n", " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n", "\n", " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n", " if model_params.decoder.type == \"hifigan\":\n", " asr_new = torch.zeros_like(asr)\n", " asr_new[:, :, 0] = asr[:, :, 0]\n", " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n", " asr = asr_new\n", "\n", " out = model.decoder(asr,\n", " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n", "\n", "\n", " return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later\n", "\n", "def STinference(text, 
ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n", " text = text.strip()\n", " ps = global_phonemizer.phonemize([text])\n", " ps = word_tokenize(ps[0])\n", " ps = ' '.join(ps)\n", "\n", " tokens = textclenaer(ps)\n", " tokens.insert(0, 0)\n", " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n", "\n", " ref_text = ref_text.strip()\n", " ps = global_phonemizer.phonemize([ref_text])\n", " ps = word_tokenize(ps[0])\n", " ps = ' '.join(ps)\n", "\n", " ref_tokens = textclenaer(ps)\n", " ref_tokens.insert(0, 0)\n", " ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)\n", "\n", "\n", " with torch.no_grad():\n", " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n", " text_mask = length_to_mask(input_lengths).to(device)\n", "\n", " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n", " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n", " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n", "\n", " ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)\n", " ref_text_mask = length_to_mask(ref_input_lengths).to(device)\n", " ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())\n", " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n", " embedding=bert_dur,\n", " embedding_scale=embedding_scale,\n", " features=ref_s, # reference from the same speaker as the embedding\n", " num_steps=diffusion_steps).squeeze(1)\n", "\n", "\n", " s = s_pred[:, 128:]\n", " ref = s_pred[:, :128]\n", "\n", " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n", " s = beta * s + (1 - beta) * ref_s[:, 128:]\n", "\n", " d = model.predictor.text_encoder(d_en,\n", " s, input_lengths, text_mask)\n", "\n", " x, _ = model.predictor.lstm(d)\n", " duration = model.predictor.duration_proj(x)\n", "\n", " duration = torch.sigmoid(duration).sum(axis=-1)\n", " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n", "\n", "\n", " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n", " c_frame = 0\n", " for i in range(pred_aln_trg.size(0)):\n", " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n", " c_frame += int(pred_dur[i].data)\n", "\n", " # encode prosody\n", " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n", " if model_params.decoder.type == \"hifigan\":\n", " asr_new = torch.zeros_like(en)\n", " asr_new[:, :, 0] = en[:, :, 0]\n", " asr_new[:, :, 1:] = en[:, :, 0:-1]\n", " en = asr_new\n", "\n", " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n", "\n", " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n", " if model_params.decoder.type == \"hifigan\":\n", " asr_new = torch.zeros_like(asr)\n", " asr_new[:, :, 0] = asr[:, :, 0]\n", " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n", " asr = asr_new\n", "\n", " out = model.decoder(asr,\n", " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n", "\n", "\n", " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later\n" ] }, { "cell_type": "markdown", "metadata": { "id": "32S6U0LyJbCA" }, "source": [ "### Synthesize speech" ] }, { "cell_type": "markdown", "metadata": { "id": "ehK_0daMJdk_" }, "source": [ "#### Basic synthesis (5 diffusion steps, seen speakers)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SJs2x41MJhM-" }, "outputs": [], "source": [ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to 
achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xuqIJe-IJb7A" }, "outputs": [], "source": [ "reference_dicts = {}\n", "reference_dicts['696_92939'] = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n", "reference_dicts['1789_142896'] = \"Demo/reference_audio/1789_142896_000022_000005.wav\"" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "H3ra3IxJJmF0" }, "outputs": [], "source": [ "noise = torch.randn(1,1,256).to(device)\n", "for k, path in reference_dicts.items():\n", " ref_s = compute_style(path)\n", " start = time.time()\n", " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n", " rtf = (time.time() - start) / (len(wav) / 24000)\n", " print(f\"RTF = {rtf:5f}\")\n", " import IPython.display as ipd\n", " print(k + ' Synthesized:')\n", " display(ipd.Audio(wav, rate=24000, normalize=False))\n", " print('Reference:')\n", " display(ipd.Audio(path, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "aB3wUz6yJ-P_" }, "source": [ "#### With higher diffusion steps (more diverse)\n", "\n", "Since the sampler is ancestral, the higher the number of steps, the more diverse the samples are, at the cost of slower synthesis." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "lF27XUo4JrKk" }, "outputs": [], "source": [ "noise = torch.randn(1,1,256).to(device)\n", "for k, path in reference_dicts.items():\n", " ref_s = compute_style(path)\n", " start = time.time()\n", " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1)\n", " rtf = (time.time() - start) / (len(wav) / 24000)\n", " print(f\"RTF = {rtf:5f}\")\n", " import IPython.display as ipd\n", " print(k + ' Synthesized:')\n", " display(ipd.Audio(wav, rate=24000, normalize=False))\n", " print(k + ' Reference:')\n", " display(ipd.Audio(path, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "pFT_vmJcKDs1" }, "source": [ "#### Basic synthesis (5 diffusion steps, unseen speakers)\n", "The following samples reproduce those in [Section 4](https://styletts2.github.io/#libri) of the demo page. All speakers are unseen during training. You can compare the generated samples to popular zero-shot TTS models like Vall-E and NaturalSpeech 2."
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HvNAeGPEKAWN" }, "outputs": [], "source": [ "reference_dicts = {}\n", "# format: (path, text)\n", "reference_dicts['1221-135767'] = (\"Demo/reference_audio/1221-135767-0014.wav\", \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\")\n", "reference_dicts['5639-40744'] = (\"Demo/reference_audio/5639-40744-0020.wav\", \"Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.\")\n", "reference_dicts['908-157963'] = (\"Demo/reference_audio/908-157963-0027.wav\", \"And lay me down in my cold bed and leave my shining lot.\")\n", "reference_dicts['4077-13754'] = (\"Demo/reference_audio/4077-13754-0000.wav\", \"The army found the people in poverty and left them in comparative wealth.\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "mFnyvYp5KAYN" }, "outputs": [], "source": [ "noise = torch.randn(1,1,256).to(device)\n", "for k, v in reference_dicts.items():\n", " path, text = v\n", " ref_s = compute_style(path)\n", " start = time.time()\n", " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n", " rtf = (time.time() - start) / (len(wav) / 24000)\n", " print(f\"RTF = {rtf:5f}\")\n", " import IPython.display as ipd\n", " print(k + ' Synthesized: ' + text)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))\n", " print(k + ' Reference:')\n", " display(ipd.Audio(path, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "QBZ53BQtKNQ6" }, "source": [ "### Speech expressiveness\n", "\n", "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page. The speaker reference used is `1221-135767-0014.wav`, which is unseen during training.\n", "\n", "#### With `embedding_scale=1`\n", "This is the classifier-free guidance scale. The higher the scale, the more strongly the sampled style is conditioned on the input text, and hence the more emotional the speech." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "5FwE9CefKQk6" }, "outputs": [], "source": [ "ref_s = compute_style(\"Demo/reference_audio/1221-135767-0014.wav\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "0CKMI0ZsKUDh" }, "outputs": [], "source": [ "texts = {}\n", "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n", "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n", "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n", "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n", "\n", "for k,v in texts.items():\n", " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n", " print(k + \": \")\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "reemQKVEKWAZ" }, "source": [ "#### With `embedding_scale=2`" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "npIAiAUvKYGv" }, "outputs": [], "source": [ "texts = {}\n", "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n", "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n", "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n", "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n", "\n", "for k,v in texts.items():\n", " noise = torch.randn(1,1,256).to(device)\n", " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=2)\n", " print(k + \": \")\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "lqKZaXeYKbrH" }, "source": [ "#### With `embedding_scale=2, alpha = 0.5, beta = 0.9`\n", "`alpha` and `beta` determine how much of the style sampled from the text is used instead of the reference style. The higher the values of `alpha` and `beta`, the better the style suits the text, but the less similar it is to the reference. Using a higher `beta` makes the synthesized speech more emotional, at the cost of lower similarity to the reference. `alpha` determines the timbre of the speaker, while `beta` determines the prosody." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "VjXuRCCWKcdN" }, "outputs": [], "source": [ "texts = {}\n", "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n", "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n", "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n", "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n", "\n", "for k,v in texts.items():\n", " noise = torch.randn(1,1,256).to(device)\n", " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=2)\n", " print(k + \": \")\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "xrwYXGh0KiIW" }, "source": [ "### Zero-shot speaker adaptation\n", "This section recreates the \"Acoustic Environment Maintenance\" and \"Speaker’s Emotion Maintenance\" demos in [Section 4](https://styletts2.github.io/#libri) of the demo page. You can compare the generated samples to popular zero-shot TTS models like Vall-E. Note that the model was trained only on LibriTTS, roughly 250 times less data than was used to train Vall-E, yet it achieves similar or better results on these maintenance tasks."
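, "\n", "For reference, the relevant style mixing inside `inference` (defined above) is:\n", "```python\n", "ref = alpha * ref + (1 - alpha) * ref_s[:, :128]  # timbre: sampled vs. reference\n", "s = beta * s + (1 - beta) * ref_s[:, 128:]  # prosody: sampled vs. reference\n", "```\n", "so `alpha = 0` keeps the reference timbre untouched, and `beta = 0.1` keeps the prosody very close to the reference."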
] }, { "cell_type": "markdown", "metadata": { "id": "ETUywHHmKimE" }, "source": [ "#### Acoustic Environment Maintenance\n", "\n", "Since we want to maintain the acoustic environment of the reference (carried in the timbre), we set `alpha = 0` to keep the speaker as close to the reference as possible while only changing the prosody according to the text." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "yvjBK3syKnZL" }, "outputs": [], "source": [ "reference_dicts = {}\n", "# format: (path, text)\n", "reference_dicts['3'] = (\"Demo/reference_audio/3.wav\", \"As friends thing I definitely I've got more male friends.\")\n", "reference_dicts['4'] = (\"Demo/reference_audio/4.wav\", \"Everything is run by computer but you got to know how to think before you can do a computer.\")\n", "reference_dicts['5'] = (\"Demo/reference_audio/5.wav\", \"Then out in LA you guys got a whole another ball game within California to worry about.\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "jclowWp4KomJ" }, "outputs": [], "source": [ "noise = torch.randn(1,1,256).to(device)\n", "for k, v in reference_dicts.items():\n", " path, text = v\n", " ref_s = compute_style(path)\n", " start = time.time()\n", " wav = inference(text, ref_s, alpha=0.0, beta=0.5, diffusion_steps=5, embedding_scale=1)\n", " rtf = (time.time() - start) / (len(wav) / 24000)\n", " print(f\"RTF = {rtf:5f}\")\n", " import IPython.display as ipd\n", " print('Synthesized: ' + text)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))\n", " print('Reference:')\n", " display(ipd.Audio(path, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "LgIm7M93KqVZ" }, "source": [ "#### Speaker’s Emotion Maintenance\n", "\n", "Since we want to maintain the speaker's emotion (carried in the prosody), we set `beta = 0.1` to keep the speaker as close to the reference as possible while allowing some diversity through a slight timbre change." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "yzsNoP6oKulL" }, "outputs": [], "source": [ "reference_dicts = {}\n", "# format: (path, text)\n", "reference_dicts['Anger'] = (\"Demo/reference_audio/anger.wav\", \"We have to reduce the number of plastic bags.\")\n", "reference_dicts['Sleepy'] = (\"Demo/reference_audio/sleepy.wav\", \"We have to reduce the number of plastic bags.\")\n", "reference_dicts['Amused'] = (\"Demo/reference_audio/amused.wav\", \"We have to reduce the number of plastic bags.\")\n", "reference_dicts['Disgusted'] = (\"Demo/reference_audio/disgusted.wav\", \"We have to reduce the number of plastic bags.\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "7h2-9cpfKwr4" }, "outputs": [], "source": [ "noise = torch.randn(1,1,256).to(device)\n", "for k, v in reference_dicts.items():\n", " path, text = v\n", " ref_s = compute_style(path)\n", " start = time.time()\n", " wav = inference(text, ref_s, alpha=0.3, beta=0.1, diffusion_steps=10, embedding_scale=1)\n", " rtf = (time.time() - start) / (len(wav) / 24000)\n", " print(f\"RTF = {rtf:5f}\")\n", " import IPython.display as ipd\n", " print(k + ' Synthesized: ' + text)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))\n", " print(k + ' Reference:')\n", " display(ipd.Audio(path, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "aNS82PGwKzgg" }, "source": [ "### Longform Narration\n", "\n", "This section includes a basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "qs97nL5HK5DH" }, "outputs": [], "source": [ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first class home made products there is a market in all large cities. All first-class grocers have customers who purchase such goods.''' # @param {type:\"string\"}" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "8Mu9whHYK_1b" }, "outputs": [], "source": [ "# seen speaker\n", "path = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n", "s_ref = compute_style(path)\n", "sentences = passage.split('.') # simple split by period\n", "wavs = []\n", "s_prev = None\n", "for text in sentences:\n", " if text.strip() == \"\": continue\n", " text += '.' # add it back\n", "\n", " wav, s_prev = LFinference(text,\n", " s_prev,\n", " s_ref,\n", " alpha = 0.3,\n", " beta = 0.9, # make it more suitable for the text\n", " t = 0.7,\n", " diffusion_steps=10, embedding_scale=1.5)\n", " wavs.append(wav)\n", "print('Synthesized: ')\n", "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))\n", "print('Reference: ')\n", "display(ipd.Audio(path, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "81Rh-lgWLB2i" }, "source": [ "### Style Transfer\n", "\n", "The following section demonstrates the style transfer capability for unseen speakers in [Section 6](https://styletts2.github.io/#emo) of the demo page. For this, we set `alpha=0.5, beta = 0.9` for the most pronounced effect (mostly using the sampled style)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "CtIgr5kOLE9a" }, "outputs": [], "source": [ "# reference texts to sample styles\n", "\n", "ref_texts = {}\n", "ref_texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n", "ref_texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n", "ref_texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n", "ref_texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\""
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "MlA1CbhzLIoI" }, "outputs": [], "source": [ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n", "s_ref = compute_style(path)\n", "\n", "text = \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\"\n", "for k,v in ref_texts.items():\n", " wav = STinference(text, s_ref, v, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=1.5)\n", " print(k + \": \")\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "2M0iaXlkLJUQ" }, "source": [ "### Speech diversity\n", "\n", "This section reproduces samples in [Section 7](https://styletts2.github.io/#var) of the demo page.\n", "\n", "`alpha` and `beta` determine the diversity of the synthesized speech. There are two extreme cases:\n", "- If `alpha = 1` and `beta = 1`, the synthesized speech sounds the most dissimilar to the reference speaker, but it is also the most diverse (each time you synthesize speech it will be totally different).\n", "- If `alpha = 0` and `beta = 0`, the synthesized speech sounds the most similar to the reference speaker, but it is deterministic (i.e., the sampled style is not used for speech synthesis).\n" ] },
{ "cell_type": "markdown", "metadata": { "id": "tSxZDvF2LNu4" }, "source": [ "#### Default setting (`alpha = 0.3, beta=0.7`)\n", "This setting uses 70% of the reference timbre and 30% of the reference prosody and uses the diffusion model to sample the rest based on the text." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "AAomGCDZLIt5" }, "outputs": [], "source": [ "# unseen speaker\n", "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n", "ref_s = compute_style(path)\n", "\n", "text = \"How much variation is there?\"\n", "for _ in range(5):\n", " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "BKrSMdgcLQRP" }, "source": [ "#### Less diverse setting (`alpha = 0.1, beta=0.3`)\n", "This setting uses 90% of the reference timbre and 70% of the reference prosody. This makes it more similar to the reference speaker at the cost of less diverse samples." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "Uo7gVmFoLRfm" }, "outputs": [], "source": [ "# unseen speaker\n", "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n", "ref_s = compute_style(path)\n", "\n", "text = \"How much variation is there?\"\n", "for _ in range(5):\n", " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.1, beta=0.3, embedding_scale=1)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "metadata": { "id": "nfQ0Xrg9LStd" }, "source": [ "#### More diverse setting (`alpha = 0.5, beta=0.95`)\n", "This setting uses 50% of the reference timbre and only 5% of the reference prosody (i.e., 95% of the sampled prosody, which makes it more diverse), but this makes it more dissimilar to the reference speaker." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "cPHz4BzVLT_u" }, "outputs": [], "source": [ "# unseen speaker\n", "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n", "ref_s = compute_style(path)\n", "\n", "text = \"How much variation is there?\"\n", "for _ in range(5):\n", " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.5, beta=0.95, embedding_scale=1)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" ] },
{ "cell_type": "markdown", "source": [ "#### Extreme setting (`alpha = 1, beta=1`)\n", "This setting uses 0% of the reference timbre and prosody and relies on the diffusion model to sample the entire style. This makes the speaker very dissimilar to the reference speaker." ], "metadata": { "id": "hPKg9eYpL00f" } },
{ "cell_type": "code", "source": [ "# unseen speaker\n", "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n", "ref_s = compute_style(path)\n", "\n", "text = \"How much variation is there?\"\n", "for _ in range(5):\n", " wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" ], "metadata": { "id": "Ei-7JOccL0bF" }, "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "source": [ "#### No variation (`alpha = 0, beta=0`)\n", "This setting uses 100% of the reference timbre and prosody and does not use the diffusion model at all. This makes the speaker very similar to the reference speaker, but there is no variation." ], "metadata": { "id": "FVMPc3bhL3eL" } },
{ "cell_type": "code", "source": [ "# unseen speaker\n", "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n", "ref_s = compute_style(path)\n", "\n", "text = \"How much variation is there?\"\n", "for _ in range(5):\n", " wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n", " display(ipd.Audio(wav, rate=24000, normalize=False))" ], "metadata": { "id": "yh1QZ7uhL4wM" }, "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "source": [ "### Extra fun!\n", "\n", "You can record your own voice and clone it using the pre-trained StyleTTS 2 model here." ], "metadata": { "id": "T0EvkWrAMBDB" } },
{ "cell_type": "markdown", "source": [ "#### Run the following cell to record your voice for 5 seconds. Please keep speaking for the whole duration for the best effect."
], "metadata": { "id": "R985j5QONY8I" } }, { "cell_type": "code", "source": [ "# all imports\n", "from IPython.display import Javascript\n", "from google.colab import output\n", "from base64 import b64decode\n", "\n", "RECORD = \"\"\"\n", "const sleep = time => new Promise(resolve => setTimeout(resolve, time))\n", "const b2text = blob => new Promise(resolve => {\n", " const reader = new FileReader()\n", " reader.onloadend = e => resolve(e.srcElement.result)\n", " reader.readAsDataURL(blob)\n", "})\n", "var record = time => new Promise(async resolve => {\n", " stream = await navigator.mediaDevices.getUserMedia({ audio: true })\n", " recorder = new MediaRecorder(stream)\n", " chunks = []\n", " recorder.ondataavailable = e => chunks.push(e.data)\n", " recorder.start()\n", " await sleep(time)\n", " recorder.onstop = async ()=>{\n", " blob = new Blob(chunks)\n", " text = await b2text(blob)\n", " resolve(text)\n", " }\n", " recorder.stop()\n", "})\n", "\"\"\"\n", "\n", "def record(sec=3):\n", " display(Javascript(RECORD))\n", " s = output.eval_js('record(%d)' % (sec*1000))\n", " b = b64decode(s.split(',')[1])\n", " with open('audio.wav','wb') as f:\n", " f.write(b)\n", " return 'audio.wav' # or webm ?" ], "metadata": { "id": "MWrFs0KWMBpz" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "#### Please run this cell and speak:" ], "metadata": { "id": "z35qXwM0Nhx1" } }, { "cell_type": "code", "source": [ "print('Speak now for 5 seconds.')\n", "audio = record(sec=5)\n", "import IPython.display as ipd\n", "display(ipd.Audio(audio, rate=24000, normalize=False))" ], "metadata": { "id": "KUEoFyQBMR-8" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "#### Synthesize in your own voice" ], "metadata": { "id": "OQS_7IBpNmM1" } }, { "cell_type": "code", "source": [ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n" ], "metadata": { "cellView": "form", "id": "c0I3LY7vM8Ta" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "reference_dicts = {}\n", "reference_dicts['You'] = audio" ], "metadata": { "id": "80eW-pwxNCxu" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "start = time.time()\n", "noise = torch.randn(1,1,256).to(device)\n", "for k, path in reference_dicts.items():\n", " ref_s = compute_style(path)\n", "\n", " wav = inference(text, ref_s, alpha=0.1, beta=0.5, diffusion_steps=5, embedding_scale=1)\n", " rtf = (time.time() - start) / (len(wav) / 24000)\n", " print('Speaker: ' + k)\n", " import IPython.display as ipd\n", " print('Synthesized:')\n", " display(ipd.Audio(wav, rate=24000, normalize=False))\n", " print('Reference:')\n", " display(ipd.Audio(path, rate=24000, normalize=False))" ], "metadata": { "id": "yIga6MTuNJaN" }, "execution_count": null, "outputs": [] } ], "metadata": { "accelerator": "GPU", "colab": { "provenance": [], "collapsed_sections": [ "aAGQPfgYIR23", "eJdB_nCOIVIN", "R985j5QONY8I" ], "authorship_tag": "ABX9TyPQdFTqqVEknEG/ma/HMfU+", "include_colab_link": true }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }