uzdzn committed on
Commit
15564d2
1 Parent(s): 33079ce

Update inference.py

Browse files
Files changed (1) hide show
  1. inference.py +9 -11
inference.py CHANGED
@@ -17,22 +17,22 @@ class InferencePipeline():
17
  # download vocoder
18
  self.hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, map_location=torch.device('cpu'))
19
 
20
- # load source audio
21
- #self.source, sr = torchaudio.load("test.wav")
22
- #self.source = torchaudio.functional.resample(self.source, sr, 16000)
23
- #self.source = self.source.unsqueeze(0)#.cuda()
24
-
25
  # load target speaker embedding
26
  self.trg_spk_emb = np.load('p225_007_mic1.npy')
27
  self.trg_spk_emb = torch.from_numpy(self.trg_spk_emb)
28
  self.trg_spk_emb = self.trg_spk_emb.unsqueeze(0)#.cuda()
29
 
30
- def voice_conversion(self, audio_file_path):
 
 
 
 
 
31
  # run inference
32
  self.model.eval()
33
  with torch.inference_mode():
34
  # Extract speech units
35
- units = self.hubert.units(audio_file_path)
36
  # Generate target spectrogram
37
  mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
38
  # Generate audio waveform
@@ -41,8 +41,6 @@ class InferencePipeline():
41
  # Assuming `target` is a tensor with the audio waveform
42
  # Convert it to numpy array and save it as an output audio file
43
  output_audio_path = "output.wav"
44
- torchaudio.save(output_audio_path, target.cpu(), sample_rate=16000)
45
-
46
- return output_audio_path
47
 
48
- #torchaudio.save("output.wav", target.squeeze(0), 16000)
 
17
  # download vocoder
18
  self.hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, map_location=torch.device('cpu'))
19
 
 
 
 
 
 
20
  # load target speaker embedding
21
  self.trg_spk_emb = np.load('p225_007_mic1.npy')
22
  self.trg_spk_emb = torch.from_numpy(self.trg_spk_emb)
23
  self.trg_spk_emb = self.trg_spk_emb.unsqueeze(0)#.cuda()
24
 
25
+ def voice_conversion(self, audio_path):
26
+ # load source audio
27
+ source, sr = torchaudio.load(audio_path) #"test.wav")
28
+ source = torchaudio.functional.resample(source, sr, 16000)
29
+ source = source.unsqueeze(0)#.cuda()
30
+
31
  # run inference
32
  self.model.eval()
33
  with torch.inference_mode():
34
  # Extract speech units
35
+ units = self.hubert.units(source)
36
  # Generate target spectrogram
37
  mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
38
  # Generate audio waveform
 
41
  # Assuming `target` is a tensor with the audio waveform
42
  # Convert it to numpy array and save it as an output audio file
43
  output_audio_path = "output.wav"
44
+ torchaudio.save("output.wav", target.squeeze(0), 16000)
 
 
45
 
46
+ return output_audio_path