In [None]:
import os
import sys

# Move the kernel's working directory to the Amphion root so that the relative
# paths and the `models.*` imports used by the following cells resolve.
# The directory the kernel started in is remembered the first time this cell
# runs; the change of directory is always taken relative to it, so running the
# cell again does not climb three more levels.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

# If the printed path is not your Amphion root, change '../../..' below to the
# path of your Amphion root relative to the directory the kernel started in.
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../../..'))
print(os.getcwd())

# Sanity check: README.md is expected to exist in the Amphion root.
assert os.path.isfile('./README.md'), (
    f'README.md not found in {os.getcwd()}; adjust the relative path passed to os.chdir above'
)

# Make packages in the current directory importable; skip if already registered.
if '.' not in sys.path:
    sys.path.append('.')

In [2]:
# Checkpoint locations. Put the checkpoint files (.bin) under ./ckpts in the
# Amphion root, or point these variables at your own pretrained weights.
#
# Download commands for the released files:
#   huggingface-cli download amphion/valle valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts
#   huggingface-cli download amphion/valle speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts
ar_model_path = 'ckpts/valle_ar_mls_196000.bin'
nar_model_path = 'ckpts/valle_nar_mls_164000.bin'
speechtokenizer_path = 'ckpts/speechtokenizer_hubert_avg'

In [None]:
# Device passed to the model constructor; set to 'cuda' if you have a GPU.
device = "cpu"

In [None]:
from models.tts.valle_v2.valle_inference import ValleInference

# Build the inference wrapper from the checkpoint paths and the device selected
# in the earlier cells (use device='cuda' there for fast GPU inference).
#
# Optional `use_vocos` keyword argument:
#   - use_vocos=True  gives better sound quality.
#   - use_vocos=False is a fallback if you run into network problems, at the
#     cost of worse quality, e.g.
#     ValleInference(use_vocos=False, ar_path=ar_model_path, nar_path=nar_model_path, device='cuda')
model = ValleInference(
    ar_path=ar_model_path,
    nar_path=nar_model_path,
    speechtokenizer_path=speechtokenizer_path,
    device=device,
)

In [4]:
# Prepare inference data: load the example prompt recording.
import librosa
import torch
from IPython.display import Audio

# librosa.load returns (samples, sample_rate); the audio is loaded at 16 kHz.
wav_np, _ = librosa.load('./egs/tts/VALLE_V2/example.wav', sr=16000)

# Keep the prompt as a float32 torch tensor for the cells that follow.
wav = torch.tensor(wav_np, dtype=torch.float32)

# Last expression of the cell: renders an audio player for the prompt.
Audio(wav, rate=16000)

In [5]:
from models.tts.valle_v2.g2p_processor import G2pProcessor

# Transcript of the prompt recording.
prompt_transcript_text = 'and keeping eternity before the eyes'

# Words you want the model to output.
target_transcript_text = 'It presents a unified framework that is inclusive of diverse generation tasks and models with the added bonus of being easily extendable for new applications'

g2p = G2pProcessor()

# Run both texts through the G2P processor with language code 'en' and keep
# element [1] of each result; these are later stored under 'phone_ids' in the
# model input. The prompt text is processed first, then the target text.
prompt_transcript, target_transcript = (
    g2p(text, 'en')[1]
    for text in (prompt_transcript_text, target_transcript_text)
)

In [6]:
# Convert both phone-id sequences to int64 tensors.
# `torch.as_tensor` accepts the plain sequence on the first run and the
# already-converted tensor on a re-run, so executing this cell twice yields the
# same values without the copy-construct warning that `torch.tensor(tensor)` emits.
prompt_transcript = torch.as_tensor(prompt_transcript, dtype=torch.long)
target_transcript = torch.as_tensor(target_transcript, dtype=torch.long)

# Model text input: prompt phone ids followed by target phone ids.
transcript = torch.cat([prompt_transcript, target_transcript], dim=-1)

# `unsqueeze(0)` adds a leading dimension of size 1 to each entry.
batch = {
    'speech': wav.unsqueeze(0),
    'phone_ids': transcript.unsqueeze(0),
}

In [7]:
# Display the contents of the model input.
# `speech` is the prompt waveform and `phone_ids` is the concatenation of
# `prompt_transcript` and `target_transcript` (in that order); each entry has a
# leading dimension of size 1 added with `unsqueeze(0)`.
batch

{'speech': tensor([[ 3.0518e-05, 3.0518e-05, 3.0518e-05, ..., -3.0518e-05,
 -3.0518e-05, 3.0518e-05]]),
 'phone_ids': tensor([[ 5, 28, 149, 72, 219, 134, 127, 170, 115, 147, 219, 113, 185, 91,
 149, 30, 185, 123, 219, 65, 115, 106, 43, 172, 219, 73, 29, 219,
 59, 214, 6, 5, 116, 181, 219, 168, 173, 124, 218, 82, 149, 185,
 175, 219, 28, 219, 210, 200, 149, 30, 106, 64, 72, 219, 104, 173,
 100, 143, 209, 94, 135, 219, 73, 24, 181, 219, 116, 214, 219, 113,
 149, 136, 140, 200, 179, 115, 205, 219, 31, 205, 219, 71, 58, 206,
 91, 175, 219, 131, 85, 149, 88, 100, 178, 30, 145, 219, 180, 24,
 179, 136, 175, 219, 28, 149, 72, 219, 141, 15, 76, 30, 140, 214,
 219, 207, 118, 74, 219, 73, 29, 219, 22, 76, 30, 72, 219, 65,
 155, 149, 30, 175, 219, 31, 205, 219, 65, 127, 115, 147, 219, 125,
 218, 30, 140, 123, 219, 83, 136, 179, 185, 82, 149, 76, 30, 67,
 30, 139, 219, 104, 43, 172, 219, 144, 199, 219, 25, 170, 140, 30,
 136, 100, 178, 30, 149, 214, 6]])}

In [8]:
# Model inference hyperparameters: a list holding a single settings dict.
configs = [
    {
        'top_p': 0.9,
        'top_k': 5,
        'temperature': 0.95,
        'repeat_penalty': 1.0,
        'max_length': 2000,
        'num_beams': 1,
    }
]

# Run the model on the prepared batch with the settings above.
output_wav = model(batch, configs)

In [9]:
# Display the waveform returned by the model (last expression of the cell).
output_wav # The output wav is a tensor of shape [1,1,T]

tensor([[[-1.2337e-06, -1.2981e-05, -4.0130e-05, ..., -4.1360e-05,
 1.1917e-05, -4.2949e-05]]])

In [10]:
# Echo both transcripts, one per line, next to the audio player.
print(
    f'prompt_transcript : {prompt_transcript_text}',
    f'target_transcript : {target_transcript_text}',
    sep='\n',
)

# squeeze(0) removes the leading size-1 dimension before playback at 16 kHz.
Audio(output_wav.squeeze(0), rate=16000)

prompt_transcript : and keeping eternity before the eyes
target_transcript : It presents a unified framework that is inclusive of diverse generation tasks and models with the added bonus of being easily extendable for new applications


In [11]:
import torchaudio

# Write the generated audio to 'out.wav' in the current working directory,
# dropping the leading size-1 dimension first; the file is saved at 16 kHz.
torchaudio.save('out.wav', output_wav.squeeze(0), sample_rate=16000)