Spaces:
Runtime error
Runtime error
StormblessedKal
committed on
Commit
·
a81bf6e
1
Parent(s):
035ae93
3 apis
Browse files- src/__pycache__/predict.cpython-310.pyc +0 -0
- src/__pycache__/se_extractor.cpython-310.pyc +0 -0
- src/predict.py +78 -2
- src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/raw/039cf8da-75b8-474d-affa-fc84066c3fa3.wav +0 -3
- src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/results/9ac5dfd2-1477-4903-adfc-1cc4d0351977-voice-clone-1.mp3 +0 -3
- src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/results/9ac5dfd2-1477-4903-adfc-1cc4d0351977-voice-clone-1.wav +0 -3
- src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg0.wav +0 -3
- src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg1.wav +0 -3
- src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg2.wav +0 -3
- src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg3.wav +0 -3
- src/rp_handler.py +1 -1
src/__pycache__/predict.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/predict.cpython-310.pyc and b/src/__pycache__/predict.cpython-310.pyc differ
|
|
src/__pycache__/se_extractor.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/se_extractor.cpython-310.pyc and b/src/__pycache__/se_extractor.cpython-310.pyc differ
|
|
src/predict.py
CHANGED
@@ -129,6 +129,10 @@ class Predictor:
|
|
129 |
sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
|
130 |
clamp=False
|
131 |
)
|
|
|
|
|
|
|
|
|
132 |
|
133 |
|
134 |
def predict(self,s3_url,passage,method_type='voice_clone'):
|
@@ -158,14 +162,86 @@ class Predictor:
|
|
158 |
result = self.process_audio_file(processed_seg_dir,passage,model,sampler)
|
159 |
final_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
|
160 |
sf.write(final_output,result,24000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
|
162 |
|
163 |
mp3_final_output_1 = str(final_output).replace('wav','mp3')
|
|
|
164 |
self.convert_wav_to_mp3(final_output,mp3_final_output_1)
|
|
|
165 |
print(mp3_final_output_1)
|
|
|
|
|
166 |
self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-1.mp3")
|
167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
|
170 |
def _fn(self,path, solver, nfe, tau):
|
171 |
if path is None:
|
@@ -322,7 +398,7 @@ class Predictor:
|
|
322 |
s_prev,
|
323 |
s_ref,
|
324 |
alpha = 0,
|
325 |
-
beta = 0.
|
326 |
t = 0.7,
|
327 |
diffusion_steps=10, embedding_scale=1)
|
328 |
wavs.append(wav)
|
|
|
129 |
sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
|
130 |
clamp=False
|
131 |
)
|
132 |
+
self.base_speaker_tts = BaseSpeakerTTS(f'{self.ckpt_base}/config.json', device=self.device)
|
133 |
+
self.base_speaker_tts.load_ckpt(f'{self.ckpt_base}/checkpoint.pth')
|
134 |
+
self.tone_color_converter = ToneColorConverter(f'{self.ckpt_converter}/config.json', device=self.device)
|
135 |
+
self.tone_color_converter.load_ckpt(f'{self.ckpt_converter}/checkpoint.pth')
|
136 |
|
137 |
|
138 |
def predict(self,s3_url,passage,method_type='voice_clone'):
|
|
|
162 |
result = self.process_audio_file(processed_seg_dir,passage,model,sampler)
|
163 |
final_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
|
164 |
sf.write(final_output,result,24000)
|
165 |
+
|
166 |
+
base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
|
167 |
+
reference_speaker = local_file_path
|
168 |
+
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
|
169 |
+
src_path = os.path.join(results_dir,f"{gen_id}-tmp.wav")
|
170 |
+
openvoice_output = os.path.join(results_dir,f"{gen_id}-2.wav")
|
171 |
+
base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0)
|
172 |
+
|
173 |
+
source_se = torch.load(f'{self.ckpt_base}/en_default_se.pth').to(self.device)
|
174 |
+
tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
|
175 |
+
(new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
|
176 |
+
denoised_openvoice_output = os.path.join(results_dir,f"{gen_id}-voice-clone-2.wav")
|
177 |
+
sf.write(denoised_openvoice_output,wav1,new_sr)
|
178 |
+
|
179 |
|
180 |
|
181 |
mp3_final_output_1 = str(final_output).replace('wav','mp3')
|
182 |
+
mp3_final_output_2 = str(denoised_openvoice_output).replace('wav','mp3')
|
183 |
self.convert_wav_to_mp3(final_output,mp3_final_output_1)
|
184 |
+
self.convert_wav_to_mp3(denoised_openvoice_output,mp3_final_output_2)
|
185 |
print(mp3_final_output_1)
|
186 |
+
print(mp3_final_output_2)
|
187 |
+
|
188 |
self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-1.mp3")
|
189 |
+
self.upload_file_to_s3(mp3_final_output_2,'demovidelyusergenerations',f"{gen_id}-voice-clone-2.mp3")
|
190 |
+
shutil.rmtree(os.path.join(output_dir,gen_id))
|
191 |
+
return {"voice_clone_1":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-1.mp3",
|
192 |
+
"voice_clone_2":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-2.mp3"
|
193 |
+
}
|
194 |
+
if method_type == 'voice_clone_with_emotions':
|
195 |
+
base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
|
196 |
+
reference_speaker = local_file_path
|
197 |
+
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
|
198 |
+
src_path = os.path.join(results_dir,f"{gen_id}-tmp-emotions.wav")
|
199 |
+
openvoice_output = os.path.join(results_dir,f"{gen_id}-4.wav")
|
200 |
+
base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0,use_emotions=True)
|
201 |
+
source_se = torch.load(f'{self.ckpt_base}/en_style_se.pth').to(self.device)
|
202 |
+
tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
|
203 |
+
(new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
|
204 |
+
denoised_openvoice_output = os.path.join(results_dir,f"{gen_id}-with-emotions.wav")
|
205 |
+
sf.write(denoised_openvoice_output,wav1,new_sr)
|
206 |
+
|
207 |
+
mp3_final_output_1 = str(denoised_openvoice_output).replace('wav','mp3')
|
208 |
+
self.convert_wav_to_mp3(denoised_openvoice_output,mp3_final_output_1)
|
209 |
+
print(mp3_final_output_1)
|
210 |
+
self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-with-emotions.mp3")
|
211 |
+
shutil.rmtree(os.path.join(output_dir,gen_id))
|
212 |
+
return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-with-emotions.mp3"
|
213 |
+
}
|
214 |
|
215 |
+
if method_type == 'voice_clone_multi_lang':
|
216 |
+
#voice clone with multi-lingual
|
217 |
+
_,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
|
218 |
+
reference_speaker = local_file_path
|
219 |
+
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
|
220 |
+
src_path = 'openai_source_output.mp3'
|
221 |
+
source_se, audio_name = se_extractor.get_se(src_path, tone_color_converter, vad=True)
|
222 |
+
|
223 |
+
|
224 |
+
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
225 |
+
response = client.audio.speech.create(
|
226 |
+
model="tts-1",
|
227 |
+
voice="fable",
|
228 |
+
input=passage
|
229 |
+
)
|
230 |
+
|
231 |
+
openai_multi_lang_path = os.path.join(results_dir,f"{gen_id}-openai-gen.wav")
|
232 |
+
response.stream_to_file(openai_multi_lang_path)
|
233 |
+
multi_lang_with_voice_clone_path = os.path.join(results_dir,f"{gen_id}-voice-clone-multi-lang.wav")
|
234 |
+
|
235 |
+
source_se, audio_name = se_extractor.get_se(src_path, tone_color_converter, vad=True)
|
236 |
+
self.tone_color_converter.convert(audio_src_path=openai_multi_lang_path, src_se=source_se, tgt_se=target_se, output_path=multi_lang_with_voice_clone_path,message='')
|
237 |
+
|
238 |
+
mp3_final_output_1 = str(multi_lang_with_voice_clone_path).replace('wav','mp3')
|
239 |
+
convert_wav_to_mp3(multi_lang_with_voice_clone_path,mp3_final_output_1)
|
240 |
+
print(mp3_final_output_1)
|
241 |
+
upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.mp3")
|
242 |
+
return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-multi-lang.mp3"
|
243 |
+
}
|
244 |
+
|
245 |
|
246 |
def _fn(self,path, solver, nfe, tau):
|
247 |
if path is None:
|
|
|
398 |
s_prev,
|
399 |
s_ref,
|
400 |
alpha = 0,
|
401 |
+
beta = 0.1,
|
402 |
t = 0.7,
|
403 |
diffusion_steps=10, embedding_scale=1)
|
404 |
wavs.append(wav)
|
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/raw/039cf8da-75b8-474d-affa-fc84066c3fa3.wav
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:68f1fdaa436c8072a3d58f8234507be22e12302c77b78ee19b1a911168f96d33
|
3 |
-
size 3098668
|
|
|
|
|
|
|
|
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/results/9ac5dfd2-1477-4903-adfc-1cc4d0351977-voice-clone-1.mp3
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b639c38bce187bdaa36582877934f9896fe87fa4a05731ed2e5f1ce9bf794820
|
3 |
-
size 1261173
|
|
|
|
|
|
|
|
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/results/9ac5dfd2-1477-4903-adfc-1cc4d0351977-voice-clone-1.wav
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:6dd8186db2bab0a68d1b6c432f1226c26b38bafb993a77734b0d79f4b32433c3
|
3 |
-
size 4954644
|
|
|
|
|
|
|
|
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg0.wav
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:312b3f6534a92d88ea1d1fdbe12ded45c29360d602a74864d2787329b4dbeddd
|
3 |
-
size 774616
|
|
|
|
|
|
|
|
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg1.wav
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:dbff348a94a69ed3875f4a89e589b542da96af938248748de0e62b416fe76aa4
|
3 |
-
size 774704
|
|
|
|
|
|
|
|
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg2.wav
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:dcca412270a4dd74f260396a72772cdfbcdc557a9c66cf20b9bfec2b350778f1
|
3 |
-
size 774704
|
|
|
|
|
|
|
|
src/processed/9ac5dfd2-1477-4903-adfc-1cc4d0351977/segments/039cf8da-75b8-474d-affa-fc84066c3fa3/wavs/039cf8da-75b8-474d-affa-fc84066c3fa3_seg3.wav
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d0ee46e6a9974651b2f2f5c350f7ea2a3e3590b5197394292ad31e51b1fc4bca
|
3 |
-
size 774618
|
|
|
|
|
|
|
|
src/rp_handler.py
CHANGED
@@ -25,7 +25,7 @@ def run_voice_clone_job(job):
|
|
25 |
assert method_type in ["create_voice","voice_clone","voice_clone_with_emotions","voice_clone_with_multi_lang"]
|
26 |
s3_url = job_input['s3_url']
|
27 |
passage = job_input['passage']
|
28 |
-
processed_urls = MODEL.predict(s3_url,passage)
|
29 |
|
30 |
return processed_urls
|
31 |
|
|
|
25 |
assert method_type in ["create_voice","voice_clone","voice_clone_with_emotions","voice_clone_with_multi_lang"]
|
26 |
s3_url = job_input['s3_url']
|
27 |
passage = job_input['passage']
|
28 |
+
processed_urls = MODEL.predict(s3_url,passage,method_type)
|
29 |
|
30 |
return processed_urls
|
31 |
|