StormblessedKal committed on
Commit
a81bf6e
·
1 Parent(s): 035ae93
src/__pycache__/predict.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/predict.cpython-310.pyc and b/src/__pycache__/predict.cpython-310.pyc differ
 
src/__pycache__/se_extractor.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/se_extractor.cpython-310.pyc and b/src/__pycache__/se_extractor.cpython-310.pyc differ
 
src/predict.py CHANGED
@@ -129,6 +129,10 @@ class Predictor:
129
  sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
130
  clamp=False
131
  )
 
 
 
 
132
 
133
 
134
  def predict(self,s3_url,passage,method_type='voice_clone'):
@@ -158,14 +162,86 @@ class Predictor:
158
  result = self.process_audio_file(processed_seg_dir,passage,model,sampler)
159
  final_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
160
  sf.write(final_output,result,24000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
 
163
  mp3_final_output_1 = str(final_output).replace('wav','mp3')
 
164
  self.convert_wav_to_mp3(final_output,mp3_final_output_1)
 
165
  print(mp3_final_output_1)
 
 
166
  self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-1.mp3")
167
- return {"voice_clone_1":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-1.mp3"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  def _fn(self,path, solver, nfe, tau):
171
  if path is None:
@@ -322,7 +398,7 @@ class Predictor:
322
  s_prev,
323
  s_ref,
324
  alpha = 0,
325
- beta = 0.3, # make it more suitable for the text
326
  t = 0.7,
327
  diffusion_steps=10, embedding_scale=1)
328
  wavs.append(wav)
 
129
  sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
130
  clamp=False
131
  )
132
+ self.base_speaker_tts = BaseSpeakerTTS(f'{self.ckpt_base}/config.json', device=self.device)
133
+ self.base_speaker_tts.load_ckpt(f'{self.ckpt_base}/checkpoint.pth')
134
+ self.tone_color_converter = ToneColorConverter(f'{self.ckpt_converter}/config.json', device=self.device)
135
+ self.tone_color_converter.load_ckpt(f'{self.ckpt_converter}/checkpoint.pth')
136
 
137
 
138
  def predict(self,s3_url,passage,method_type='voice_clone'):
 
162
  result = self.process_audio_file(processed_seg_dir,passage,model,sampler)
163
  final_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
164
  sf.write(final_output,result,24000)
165
+
166
+ base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
167
+ reference_speaker = local_file_path
168
+ target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
169
+ src_path = os.path.join(results_dir,f"{gen_id}-tmp.wav")
170
+ openvoice_output = os.path.join(results_dir,f"{gen_id}-2.wav")
171
+ base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0)
172
+
173
+ source_se = torch.load(f'{self.ckpt_base}/en_default_se.pth').to(self.device)
174
+ tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
175
+ (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
176
+ denoised_openvoice_output = os.path.join(results_dir,f"{gen_id}-voice-clone-2.wav")
177
+ sf.write(denoised_openvoice_output,wav1,new_sr)
178
+
179
 
180
 
181
  mp3_final_output_1 = str(final_output).replace('wav','mp3')
182
+ mp3_final_output_2 = str(denoised_openvoice_output).replace('wav','mp3')
183
  self.convert_wav_to_mp3(final_output,mp3_final_output_1)
184
+ self.convert_wav_to_mp3(denoised_openvoice_output,mp3_final_output_2)
185
  print(mp3_final_output_1)
186
+ print(mp3_final_output_2)
187
+
188
  self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-1.mp3")
189
+ self.upload_file_to_s3(mp3_final_output_2,'demovidelyusergenerations',f"{gen_id}-voice-clone-2.mp3")
190
+ shutil.rmtree(os.path.join(output_dir,gen_id))
191
+ return {"voice_clone_1":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-1.mp3",
192
+ "voice_clone_2":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-2.mp3"
193
+ }
194
+ if method_type == 'voice_clone_with_emotions':
195
+ base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
196
+ reference_speaker = local_file_path
197
+ target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
198
+ src_path = os.path.join(results_dir,f"{gen_id}-tmp-emotions.wav")
199
+ openvoice_output = os.path.join(results_dir,f"{gen_id}-4.wav")
200
+ base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0,use_emotions=True)
201
+ source_se = torch.load(f'{self.ckpt_base}/en_style_se.pth').to(self.device)
202
+ tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
203
+ (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
204
+ denoised_openvoice_output = os.path.join(results_dir,f"{gen_id}-with-emotions.wav")
205
+ sf.write(denoised_openvoice_output,wav1,new_sr)
206
+
207
+ mp3_final_output_1 = str(denoised_openvoice_output).replace('wav','mp3')
208
+ self.convert_wav_to_mp3(denoised_openvoice_output,mp3_final_output_1)
209
+ print(mp3_final_output_1)
210
+ self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-with-emotions.mp3")
211
+ shutil.rmtree(os.path.join(output_dir,gen_id))
212
+ return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-with-emotions.mp3"
213
+ }
214
 
215
+ if method_type == 'voice_clone_multi_lang':
216
+ #voice clone with multi-lingual
217
+ _,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
218
+ reference_speaker = local_file_path
219
+ target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
220
+ src_path = 'openai_source_output.mp3'
221
+ source_se, audio_name = se_extractor.get_se(src_path, tone_color_converter, vad=True)
222
+
223
+
224
+ client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
225
+ response = client.audio.speech.create(
226
+ model="tts-1",
227
+ voice="fable",
228
+ input=passage
229
+ )
230
+
231
+ openai_multi_lang_path = os.path.join(results_dir,f"{gen_id}-openai-gen.wav")
232
+ response.stream_to_file(openai_multi_lang_path)
233
+ multi_lang_with_voice_clone_path = os.path.join(results_dir,f"{gen_id}-voice-clone-multi-lang.wav")
234
+
235
+ source_se, audio_name = se_extractor.get_se(src_path, tone_color_converter, vad=True)
236
+ self.tone_color_converter.convert(audio_src_path=openai_multi_lang_path, src_se=source_se, tgt_se=target_se, output_path=multi_lang_with_voice_clone_path,message='')
237
+
238
+ mp3_final_output_1 = str(multi_lang_with_voice_clone_path).replace('wav','mp3')
239
+ convert_wav_to_mp3(multi_lang_with_voice_clone_path,mp3_final_output_1)
240
+ print(mp3_final_output_1)
241
+ upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.mp3")
242
+ return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-multi-lang.mp3"
243
+ }
244
+
245
 
246
  def _fn(self,path, solver, nfe, tau):
247
  if path is None:
 
398
  s_prev,
399
  s_ref,
400
  alpha = 0,
401
+ beta = 0.1,
402
  t = 0.7,
403
  diffusion_steps=10, embedding_scale=1)
404
  wavs.append(wav)
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/raw/039cf8da-75b8-474d-affa-fc84066c3fa3.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:68f1fdaa436c8072a3d58f8234507be22e12302c77b78ee19b1a911168f96d33
3
- size 3098668
 
 
 
 
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/results/9ac5dfd2-1477-4903-adfc-1cc4d0351977-voice-clone-1.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b639c38bce187bdaa36582877934f9896fe87fa4a05731ed2e5f1ce9bf794820
3
- size 1261173
 
 
 
 
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/results/9ac5dfd2-1477-4903-adfc-1cc4d0351977-voice-clone-1.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6dd8186db2bab0a68d1b6c432f1226c26b38bafb993a77734b0d79f4b32433c3
3
- size 4954644
 
 
 
 
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg0.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:312b3f6534a92d88ea1d1fdbe12ded45c29360d602a74864d2787329b4dbeddd
3
- size 774616
 
 
 
 
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg1.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dbff348a94a69ed3875f4a89e589b542da96af938248748de0e62b416fe76aa4
3
- size 774704
 
 
 
 
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg2.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcca412270a4dd74f260396a72772cdfbcdc557a9c66cf20b9bfec2b350778f1
3
- size 774704
 
 
 
 
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg3.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0ee46e6a9974651b2f2f5c350f7ea2a3e3590b5197394292ad31e51b1fc4bca
3
- size 774618
 
 
 
 
src/rp_handler.py CHANGED
@@ -25,7 +25,7 @@ def run_voice_clone_job(job):
25
  assert method_type in ["create_voice","voice_clone","voice_clone_with_emotions","voice_clone_with_multi_lang"]
26
  s3_url = job_input['s3_url']
27
  passage = job_input['passage']
28
- processed_urls = MODEL.predict(s3_url,passage)
29
 
30
  return processed_urls
31
 
 
25
  assert method_type in ["create_voice","voice_clone","voice_clone_with_emotions","voice_clone_with_multi_lang"]
26
  s3_url = job_input['s3_url']
27
  passage = job_input['passage']
28
+ processed_urls = MODEL.predict(s3_url,passage,method_type)
29
 
30
  return processed_urls
31