{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "os.chdir('../../..')\n", "print(os.getcwd()) # Ensure this is you Amphion root path, otherwise change the above path to you amphion root path\n", "assert os.path.isfile('./README.md') # make sure the current path is Amphion root path\n", "import sys\n", "sys.path.append('.')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# put your cheackpoint file (.bin) in the root path of AmphionVALLEv2\n", "# or use your own pretrained weights\n", "ar_model_path = 'ckpts/valle_ar_mls_196000.bin' # huggingface-cli download amphion/valle valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts\n", "nar_model_path = 'ckpts/valle_nar_mls_164000.bin'\n", "speechtokenizer_path = 'ckpts/speechtokenizer_hubert_avg' # huggingface-cli download amphion/valle speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "device = 'cpu' # change to 'cuda' if you have gpu" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from models.tts.valle_v2.valle_inference import ValleInference\n", "# change to device='cuda' to use CUDA GPU for fast inference\n", "# change \"use_vocos\" to True would give better sound quality\n", "# If you meet problem with network, you could set \"use_vocos=False\", though would give bad quality\n", "model = ValleInference(ar_path=ar_model_path, nar_path=nar_model_path, speechtokenizer_path=speechtokenizer_path, device=device)\n", "# model = ValleInference(use_vocos=False, ar_path=ar_model_path, nar_path=nar_model_path, device='cuda')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# prepare inference data\n", "import librosa\n", "import torch\n", "wav, _ = librosa.load('./egs/tts/VALLE_V2/example.wav', sr=16000)\n", "wav = torch.tensor(wav, dtype=torch.float32)\n", "from IPython.display import Audio\n", "Audio(wav, rate = 16000)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# The transcript of the prompt part\n", "prompt_transcript_text = 'and keeping eternity before the eyes'\n", "\n", "# Here are the words you want the model to output\n", "target_transcript_text = 'It presents a unified framework that is inclusive of diverse generation tasks and models with the added bonus of being easily extendable for new applications'\n", "from models.tts.valle_v2.g2p_processor import G2pProcessor\n", "g2p = G2pProcessor()\n", "prompt_transcript = g2p(prompt_transcript_text, 'en')[1]\n", "target_transcript = g2p(target_transcript_text, 'en')[1]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "prompt_transcript = torch.tensor(prompt_transcript).long()\n", "target_transcript = torch.tensor(target_transcript).long()\n", "transcript = torch.cat([prompt_transcript, target_transcript], dim=-1)\n", "batch = {\n", " 'speech': wav.unsqueeze(0),\n", " 'phone_ids': transcript.unsqueeze(0),\n", "}" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'speech': tensor([[ 3.0518e-05, 3.0518e-05, 3.0518e-05, ..., -3.0518e-05,\n", " -3.0518e-05, 3.0518e-05]]),\n", " 'phone_ids': tensor([[ 5, 28, 149, 72, 219, 134, 127, 170, 115, 147, 219, 113, 185, 91,\n", " 149, 30, 185, 123, 219, 65, 115, 106, 43, 172, 219, 73, 29, 219,\n", " 59, 214, 6, 5, 116, 181, 219, 168, 173, 124, 218, 82, 149, 185,\n", " 175, 219, 28, 219, 210, 200, 149, 30, 106, 64, 72, 219, 104, 173,\n", " 100, 143, 209, 94, 135, 219, 73, 24, 181, 219, 116, 214, 219, 113,\n", " 149, 136, 140, 200, 179, 115, 205, 219, 31, 205, 219, 71, 58, 206,\n", " 91, 175, 219, 131, 85, 149, 88, 100, 178, 30, 145, 219, 180, 24,\n", " 179, 136, 175, 219, 28, 149, 72, 219, 141, 15, 76, 30, 140, 214,\n", " 219, 207, 118, 74, 219, 73, 29, 219, 22, 76, 30, 72, 219, 65,\n", " 155, 149, 30, 175, 219, 31, 205, 219, 65, 127, 115, 147, 219, 125,\n", " 218, 30, 140, 123, 219, 83, 136, 179, 185, 82, 149, 76, 30, 67,\n", " 30, 139, 219, 104, 43, 172, 219, 144, 199, 219, 25, 170, 140, 30,\n", " 136, 100, 178, 30, 149, 214, 6]])}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# print the contents of the model input\n", "# `phone_ids` contains a concatenation of `prompt_transcript` and `target_transcript` \n", "batch" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "configs = [dict(\n", " top_p=0.9,\n", " top_k=5,\n", " temperature=0.95,\n", " repeat_penalty=1.0,\n", " max_length=2000,\n", " num_beams=1,\n", ")] # model inference hyperparameters\n", "output_wav = model(batch, configs)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[[-1.2337e-06, -1.2981e-05, -4.0130e-05, ..., -4.1360e-05,\n", " 1.1917e-05, -4.2949e-05]]])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output_wav # The output wav is a tensor of shape [1,1,T]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "prompt_transcript : and keeping eternity before the eyes\n", "target_transcript : It presents a unified framework that is inclusive of diverse generation tasks and models with the added bonus of being easily extendable for new applications\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(f'prompt_transcript : {prompt_transcript_text}')\n", "print(f'target_transcript : {target_transcript_text}')\n", "Audio(output_wav.squeeze(0), rate = 16000)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import torchaudio\n", "torchaudio.save('out.wav', output_wav.squeeze(0), 16000)" ] } ], "metadata": { "kernelspec": { "display_name": "amphion", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15" } }, "nbformat": 4, "nbformat_minor": 2 }