Spaces:
Running
Running
import torch | |
import torchaudio | |
import numpy as np | |
from decoder_base import AcousticModel | |
class InferencePipeline():
    """Voice-conversion pipeline: content encoder -> acoustic model -> vocoder.

    On construction the pipeline fetches the ``hubert_soft`` content encoder
    and the ``hifigan_hubert_soft`` vocoder through ``torch.hub``, restores
    the acoustic model from a local checkpoint and loads the target speaker
    embedding. Everything is kept on the CPU.
    """

    # Sample rate used for the encoder input and for the written output file.
    SAMPLE_RATE = 16000

    def __init__(self, ckpt_path='model-best.pt', spk_emb_path='p225_007_mic1.npy'):
        """Load all models and the target speaker embedding.

        Args:
            ckpt_path: Path to the acoustic-model checkpoint. The file must
                contain the state dict under the ``'acoustic-model'`` key.
            spk_emb_path: Path to a ``.npy`` file holding the target speaker
                embedding.
        """
        # download hubert content encoder
        self.hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)

        # initialize decoder with checkpoint
        self.model = AcousticModel()
        # NOTE(review): torch.load unpickles the file - only point this at
        # checkpoints from a trusted source.
        cp = torch.load(ckpt_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(cp['acoustic-model'])

        # download vocoder
        self.hifigan = torch.hub.load(
            "bshall/hifigan:main",
            "hifigan_hubert_soft",
            trust_repo=True,
            map_location=torch.device('cpu'),
        )

        # load target speaker embedding and add a leading batch dimension
        self.trg_spk_emb = torch.from_numpy(np.load(spk_emb_path)).unsqueeze(0)

    def voice_conversion(self, audio_file_path, output_audio_path="output.wav"):
        """Convert the speech in an audio file to the target speaker's voice.

        Args:
            audio_file_path: Path of the source audio file to convert.
            output_audio_path: Where the converted audio is written.

        Returns:
            ``output_audio_path``, the path of the written audio file.
        """
        # The content encoder operates on a waveform tensor, not on a path,
        # so the file has to be decoded first.
        source, sr = torchaudio.load(audio_file_path)
        # Feed a single channel to the encoder.
        # NOTE(review): assumes hubert_soft wants mono input - confirm.
        if source.size(0) > 1:
            source = source.mean(dim=0, keepdim=True)
        if sr != self.SAMPLE_RATE:
            source = torchaudio.functional.resample(source, sr, self.SAMPLE_RATE)
        # Add the batch dimension: (channels, samples) -> (1, channels, samples).
        source = source.unsqueeze(0)

        # run inference
        self.model.eval()
        with torch.inference_mode():
            # Extract speech units
            units = self.hubert.units(source)
            # Generate target spectrogram
            mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
            # Generate audio waveform
            target = self.hifigan(mel)

        # torchaudio.save needs a 2D (channels, samples) tensor, so drop the
        # batch dimension first.
        # NOTE(review): assumes the vocoder returns (1, 1, samples) - confirm.
        torchaudio.save(
            output_audio_path,
            target.squeeze(0).cpu(),
            sample_rate=self.SAMPLE_RATE,
        )
        return output_audio_path