{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.chdir('../../..')\n",
"print(os.getcwd()) # Ensure this is you Amphion root path, otherwise change the above path to you amphion root path\n",
"assert os.path.isfile('./README.md') # make sure the current path is Amphion root path\n",
"import sys\n",
"sys.path.append('.')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# put your cheackpoint file (.bin) in the root path of AmphionVALLEv2\n",
"# or use your own pretrained weights\n",
"ar_model_path = 'ckpts/valle_ar_mls_196000.bin' # huggingface-cli download amphion/valle valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts\n",
"nar_model_path = 'ckpts/valle_nar_mls_164000.bin'\n",
"speechtokenizer_path = 'ckpts/speechtokenizer_hubert_avg' # huggingface-cli download amphion/valle speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts"
]
},
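{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (not in the original recipe): confirm the checkpoints\n",
"# were downloaded to the paths above before loading the model\n",
"for path in [ar_model_path, nar_model_path, speechtokenizer_path]:\n",
"    assert os.path.exists(path), f'{path} not found; run the huggingface-cli commands above'"
]
},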
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"device = 'cpu' # change to 'cuda' if you have gpu"
]
},
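{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional alternative (not in the original recipe): select the device\n",
"# automatically, using CUDA whenever it is available\n",
"import torch\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"print(f'Using device: {device}')"
]
},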
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from models.tts.valle_v2.valle_inference import ValleInference\n",
"# change to device='cuda' to use CUDA GPU for fast inference\n",
"# change \"use_vocos\" to True would give better sound quality\n",
"# If you meet problem with network, you could set \"use_vocos=False\", though would give bad quality\n",
"model = ValleInference(ar_path=ar_model_path, nar_path=nar_model_path, speechtokenizer_path=speechtokenizer_path, device=device)\n",
"# model = ValleInference(use_vocos=False, ar_path=ar_model_path, nar_path=nar_model_path, device='cuda')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# prepare inference data\n",
"import librosa\n",
"import torch\n",
"wav, _ = librosa.load('./egs/tts/VALLE_V2/example.wav', sr=16000)\n",
"wav = torch.tensor(wav, dtype=torch.float32)\n",
"from IPython.display import Audio\n",
"Audio(wav, rate = 16000)"
]
},
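{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional (not in the original recipe): the prompt is loaded as 16 kHz mono,\n",
"# so its duration in seconds is the number of samples divided by 16000\n",
"print(f'Prompt duration: {wav.shape[-1] / 16000:.2f} s')"
]
},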
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# The transcript of the prompt part\n",
"prompt_transcript_text = 'and keeping eternity before the eyes'\n",
"\n",
"# Here are the words you want the model to output\n",
"target_transcript_text = 'It presents a unified framework that is inclusive of diverse generation tasks and models with the added bonus of being easily extendable for new applications'\n",
"from models.tts.valle_v2.g2p_processor import G2pProcessor\n",
"g2p = G2pProcessor()\n",
"prompt_transcript = g2p(prompt_transcript_text, 'en')[1]\n",
"target_transcript = g2p(target_transcript_text, 'en')[1]"
]
},
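{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional (not in the original recipe): the G2pProcessor returns phoneme ID\n",
"# sequences; inspect their lengths before building the model input\n",
"print(f'{len(prompt_transcript)} prompt phones, {len(target_transcript)} target phones')"
]
},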
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"prompt_transcript = torch.tensor(prompt_transcript).long()\n",
"target_transcript = torch.tensor(target_transcript).long()\n",
"transcript = torch.cat([prompt_transcript, target_transcript], dim=-1)\n",
"batch = {\n",
" 'speech': wav.unsqueeze(0),\n",
" 'phone_ids': transcript.unsqueeze(0),\n",
"}"
]
},
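{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional (not in the original recipe): check the model input shapes;\n",
"# `speech` is a [1, T] waveform and `phone_ids` is a [1, L] phoneme sequence\n",
"print(batch['speech'].shape, batch['phone_ids'].shape)"
]
},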
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'speech': tensor([[ 3.0518e-05, 3.0518e-05, 3.0518e-05, ..., -3.0518e-05,\n",
" -3.0518e-05, 3.0518e-05]]),\n",
" 'phone_ids': tensor([[ 5, 28, 149, 72, 219, 134, 127, 170, 115, 147, 219, 113, 185, 91,\n",
" 149, 30, 185, 123, 219, 65, 115, 106, 43, 172, 219, 73, 29, 219,\n",
" 59, 214, 6, 5, 116, 181, 219, 168, 173, 124, 218, 82, 149, 185,\n",
" 175, 219, 28, 219, 210, 200, 149, 30, 106, 64, 72, 219, 104, 173,\n",
" 100, 143, 209, 94, 135, 219, 73, 24, 181, 219, 116, 214, 219, 113,\n",
" 149, 136, 140, 200, 179, 115, 205, 219, 31, 205, 219, 71, 58, 206,\n",
" 91, 175, 219, 131, 85, 149, 88, 100, 178, 30, 145, 219, 180, 24,\n",
" 179, 136, 175, 219, 28, 149, 72, 219, 141, 15, 76, 30, 140, 214,\n",
" 219, 207, 118, 74, 219, 73, 29, 219, 22, 76, 30, 72, 219, 65,\n",
" 155, 149, 30, 175, 219, 31, 205, 219, 65, 127, 115, 147, 219, 125,\n",
" 218, 30, 140, 123, 219, 83, 136, 179, 185, 82, 149, 76, 30, 67,\n",
" 30, 139, 219, 104, 43, 172, 219, 144, 199, 219, 25, 170, 140, 30,\n",
" 136, 100, 178, 30, 149, 214, 6]])}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# print the contents of the model input\n",
"# `phone_ids` contains a concatenation of `prompt_transcript` and `target_transcript` \n",
"batch"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"configs = [dict(\n",
" top_p=0.9,\n",
" top_k=5,\n",
" temperature=0.95,\n",
" repeat_penalty=1.0,\n",
" max_length=2000,\n",
" num_beams=1,\n",
")] # model inference hyperparameters\n",
"output_wav = model(batch, configs)"
]
},
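{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional (not in the original recipe): the model outputs 16 kHz audio,\n",
"# so the synthesized duration is the number of samples divided by 16000\n",
"print(f'Output duration: {output_wav.shape[-1] / 16000:.2f} s')"
]
},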
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[[-1.2337e-06, -1.2981e-05, -4.0130e-05, ..., -4.1360e-05,\n",
" 1.1917e-05, -4.2949e-05]]])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output_wav # The output wav is a tensor of shape [1,1,T]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"prompt_transcript : and keeping eternity before the eyes\n",
"target_transcript : It presents a unified framework that is inclusive of diverse generation tasks and models with the added bonus of being easily extendable for new applications\n"
]
}
],
"source": [
"print(f'prompt_transcript : {prompt_transcript_text}')\n",
"print(f'target_transcript : {target_transcript_text}')\n",
"Audio(output_wav.squeeze(0), rate = 16000)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import torchaudio\n",
"torchaudio.save('out.wav', output_wav.squeeze(0), 16000)"
]
}
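,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional (not in the original recipe): reload the saved file and play it\n",
"# to verify it was written correctly\n",
"Audio('out.wav')"
]
}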
],
"metadata": {
"kernelspec": {
"display_name": "amphion",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}