Spaces: Running
NeuralFalcon committed: Update app.py

app.py CHANGED
@@ -1,3 +1,4 @@
+
 from KOKORO.models import build_model
 from KOKORO.utils import tts,tts_file_name,podcast
 import sys
@@ -6,11 +7,25 @@ import os
 os.system("python download_model.py")
 import torch
 import gc
+import platform
+import shutil
+base_path=os.getcwd()
+def clean_folder_before_start():
+    global base_path
+    folder_list=["dummy","TTS_DUB","kokoro_audio"]
+    for folder in folder_list:
+        if os.path.exists(f"{base_path}/{folder}"):
+            try:
+                shutil.rmtree(f"{base_path}/{folder}")
+            except:
+                pass
+        os.makedirs(f"{base_path}/{folder}", exist_ok=True)
+clean_folder_before_start()
+
 print("Loading model...")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print(f'Using device: {device}')
-
-MODEL = build_model('./KOKORO/fp16/kokoro-v0_19-half.pth', device)
+MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
 print("Model loaded successfully.")
 
 def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
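Note on the hunk above: the new `clean_folder_before_start()` wipes and recreates the working folders with a bare `except`. A minimal equivalent sketch (an assumption about intent, not the committed code) would lean on `ignore_errors=True` instead:

```python
# Sketch: best-effort wipe-and-recreate without a bare except.
import os
import shutil

for folder in ["dummy", "TTS_DUB", "kokoro_audio"]:
    shutil.rmtree(folder, ignore_errors=True)  # ignore missing or locked dirs
    os.makedirs(folder, exist_ok=True)         # recreate empty folder
```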
@@ -22,7 +37,7 @@ def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_p
 
 
 model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
-current_model = model_list[
+current_model = model_list[0]
 
 def update_model(model_name):
     """
@@ -43,8 +58,21 @@ def update_model(model_name):
     return f"Model updated to {model_name}"
 
 
+def manage_files(file_path):
+    if os.path.exists(file_path):
+        file_extension = os.path.splitext(file_path)[1]  # Get file extension
+        file_size = os.path.getsize(file_path)  # Get file size in bytes
+        # Check if file is a valid .pt file and its size is ≤ 5 MB
+        if file_extension == ".pt" and file_size <= 5 * 1024 * 1024:
+            return True  # File is valid and kept
+        else:
+            os.remove(file_path)  # Delete invalid or oversized file
+            return False
+    return False  # File does not exist
 
-def text_to_speech(text, model_name="kokoro-v0_19-half.pth", voice_name="af", sp
+
+
+def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20,custom_voicepack=None,trim=0.0):
     """
     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
     """
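The new `manage_files()` helper gates custom voicepacks: a path is accepted only if it exists, has a `.pt` extension, and is at most 5 MB; anything else is deleted and rejected. A side-effect-free sketch of the same predicate (hypothetical helper name, not in the commit):

```python
import os

def is_acceptable_voicepack(path: str) -> bool:
    """True if path exists, is a .pt file, and is at most 5 MB."""
    return (
        os.path.exists(path)
        and os.path.splitext(path)[1] == ".pt"
        and os.path.getsize(path) <= 5 * 1024 * 1024
    )
```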
@@ -54,6 +82,12 @@ def text_to_speech(text, model_name="kokoro-v0_19-half.pth", voice_name="af", sp
     minimum_silence = 0.05
     keep_silence = int(minimum_silence * 1000)
     save_at = tts_file_name(text)
+    # print(voice_name,custom_voicepack)
+    if custom_voicepack:
+        if manage_files(custom_voicepack):
+            voice_name = custom_voicepack
+        else:
+            gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
     audio_path = tts_maker(
         text,
         voice_name,
@@ -96,7 +130,6 @@ def toggle_autoplay(autoplay):
 
 with gr.Blocks() as demo1:
     gr.Markdown("# Batched TTS")
-    gr.Markdown("Run on Your Local System [Kokoro-82M-WebUI](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(
@@ -115,16 +148,17 @@ with gr.Blocks() as demo1:
             with gr.Row():
                 generate_btn = gr.Button('Generate', variant='primary')
             with gr.Accordion('Audio Settings', open=False):
-                model_name=gr.Dropdown(model_list,label="Model",value=model_list[
+                model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
+                speed = gr.Slider(
+                    minimum=0.25, maximum=2, value=1, step=0.1,
+                    label='⚡️Speed', info='Adjust the speaking speed'
+                )
                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
                 minimum_silence = gr.Number(
                     label="Keep Silence Upto (In seconds)",
                     value=0.05
                 )
-                speed = gr.Slider(
-                    minimum=0.25, maximum=2, value=1, step=0.1,
-                    label='⚡️Speed', info='Adjust the speaking speed'
-                )
+
                 # trim = gr.Slider(
                 #     minimum=0, maximum=1, value=0, step=0.1,
                 #     label='🔪 Trim', info='How much to cut from both ends of each segment'
@@ -134,6 +168,8 @@ with gr.Blocks() as demo1:
                     label='🔇 Pad Between', info='Silent Duration between segments [For Large Text]'
                 )
 
+                custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')
+
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):
@@ -142,16 +178,16 @@ with gr.Blocks() as demo1:
 
     text.submit(
         text_to_speech,
-        inputs=[text, model_name,voice, speed, pad_between, remove_silence, minimum_silence],
+        inputs=[text, model_name,voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
         outputs=[audio]
     )
     generate_btn.click(
         text_to_speech,
-        inputs=[text,model_name, voice, speed, pad_between, remove_silence, minimum_silence],
+        inputs=[text,model_name, voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
        outputs=[audio]
     )
 
-def podcast_maker(text,remove_silence=False,minimum_silence=50,model_name="kokoro-v0_19
+def podcast_maker(text,remove_silence=False,minimum_silence=50,model_name="kokoro-v0_19.pth"):
     global MODEL,device
     update_model(model_name)
     if not minimum_silence:
@@ -258,13 +294,13 @@ def your_tts(text,audio_path,actual_duration,speed=1.0):
     global srt_voice_name
     model_name="kokoro-v0_19.pth"
     tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed,trim=1.0)
-
+    # print(tts_path)
     tts_audio = AudioSegment.from_file(tts_path)
     tts_duration = len(tts_audio)
     if tts_duration > actual_duration:
         speedup_factor = tts_duration / actual_duration
         tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor,trim=1.0)
-
+        # print(tts_path)
     shutil.copy(tts_path,audio_path)
 
 
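For context on the hunk above: the speedup factor is just the ratio of the generated TTS length to the time available in the subtitle slot, for example:

```python
# Worked example of the ratio used in your_tts().
tts_duration_ms = 6000      # generated TTS clip length
actual_duration_ms = 4000   # time available in the subtitle slot
speedup_factor = tts_duration_ms / actual_duration_ms
print(speedup_factor)       # 1.5 -> re-synthesize the line at 1.5x speed
```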
@@ -321,6 +357,77 @@ def clean_srt(input_path):
 
 
 
+import librosa
+import soundfile as sf
+import subprocess
+
+def speedup_audio_librosa(input_file, output_file, speedup_factor):
+    try:
+        # Load the audio file
+        y, sr = librosa.load(input_file, sr=None)
+
+        # Use time stretching to speed up audio without changing pitch
+        y_stretched = librosa.effects.time_stretch(y, rate=speedup_factor)
+
+        # Save the output with the original sample rate
+        sf.write(output_file, y_stretched, sr)
+        # print(f"Speed up by {speedup_factor} completed successfully: {output_file}")
+
+    except Exception as e:
+        gr.Warning(f"Error during speedup with Librosa: {e}")
+        shutil.copy(input_file, output_file)
+
+
+
+
+def is_ffmpeg_installed():
+    if platform.system() == "Windows":
+        local_ffmpeg_path = os.path.join("./ffmpeg", "ffmpeg.exe")
+    else:
+        local_ffmpeg_path = "ffmpeg"
+    try:
+        subprocess.run([local_ffmpeg_path, "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
+        # print("FFmpeg is installed")
+        return True,local_ffmpeg_path
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        # print("FFmpeg is not installed. Using 'librosa' for speedup audio in SRT dubbing")
+        gr.Warning("FFmpeg is not installed. Using 'librosa' for speedup audio in SRT dubbing",duration= 20)
+        return False,local_ffmpeg_path
+
+
+
+
+# ffmpeg -i test.wav -filter:a "atempo=2.0" ffmpeg.wav -y
+def change_speed(input_file, output_file, speedup_factor):
+    global use_ffmpeg,local_ffmpeg_path
+    if use_ffmpeg:
+        # print("Using FFmpeg for speedup")
+        try:
+            # subprocess.run([
+            #     local_ffmpeg_path,
+            #     "-i", input_file,
+            #     "-filter:a", f"atempo={speedup_factor}",
+            #     output_file,
+            #     "-y"
+            # ], check=True)
+            subprocess.run([
+                local_ffmpeg_path,
+                "-i", input_file,
+                "-filter:a", f"atempo={speedup_factor}",
+                output_file,
+                "-y"
+            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        except Exception as e:
+            gr.Error(f"Error during speedup with FFmpeg: {e}")
+            speedup_audio_librosa(input_file, output_file, speedup_factor)
+    else:
+        # print("Using Librosa for speedup")
+        speedup_audio_librosa(input_file, output_file, speedup_factor)
+
+
+
 
 
 class SRTDubbing:
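One caveat on the FFmpeg path added above: some ffmpeg builds cap a single `atempo` stage to the 0.5-2.0 range, so larger factors may need chained stages. A hedged sketch of such chaining (not part of the commit, which passes a single `atempo` value):

```python
import subprocess

def atempo_chain(factor: float) -> str:
    """Split a speed factor into chained atempo stages, each <= 2.0."""
    stages = []
    while factor > 2.0:
        stages.append("atempo=2.0")
        factor /= 2.0
    stages.append(f"atempo={factor:.3f}")
    return ",".join(stages)

# e.g. atempo_chain(3.0) -> "atempo=2.0,atempo=1.500"
subprocess.run(
    ["ffmpeg", "-i", "test.wav", "-filter:a", atempo_chain(3.0), "out.wav", "-y"],
    check=True,
)
```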
@@ -343,14 +450,15 @@ class SRTDubbing:
         if tts_duration > actual_duration:
             speedup_factor = tts_duration / actual_duration
             speedup_filename = "./cache/speedup_temp.wav"
+            change_speed(tts_filename, speedup_filename, speedup_factor)
             # Use ffmpeg to change audio speed
-            subprocess.run([
-                "ffmpeg",
-                "-i", tts_filename,
-                "-filter:a", f"atempo={speedup_factor}",
-                speedup_filename,
-                "-y"
-            ], check=True)
+            # subprocess.run([
+            #     "ffmpeg",
+            #     "-i", tts_filename,
+            #     "-filter:a", f"atempo={speedup_factor}",
+            #     speedup_filename,
+            #     "-y"
+            # ], check=True)
 
             # Replace the original TTS audio with the sped-up version
             shutil.move(speedup_filename, audio_path)
@@ -456,10 +564,27 @@ class SRTDubbing:
         with open("entries.json", "w") as file:
             json.dump(entries, file, indent=4)
         return entries
-srt_voice_name="
-
-
-
+srt_voice_name="af"
+use_ffmpeg,local_ffmpeg_path = is_ffmpeg_installed()
+# use_ffmpeg=False
+
+def srt_process(srt_file_path,voice_name,custom_voicepack=None,dest_language="en"):
+    global srt_voice_name,use_ffmpeg
+
+    if not srt_file_path.endswith(".srt"):
+        gr.Error("Please upload a valid .srt file",duration=5)
+        return None
+    if use_ffmpeg:
+        gr.Success("Using FFmpeg for audio speedup to sync with subtitle")
+    else:
+        gr.Warning("Install FFmpeg to ensure high-quality audio when speeding up the audio to sync with subtitle. Default Using 'librosa' for speedup",duration= 20)
+
+    if custom_voicepack:
+        if manage_files(custom_voicepack):
+            srt_voice_name = custom_voicepack
+        else:
+            srt_voice_name=voice_name
+            gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
     srt_dubbing = SRTDubbing()
     dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
     srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
@@ -476,7 +601,7 @@ with gr.Blocks() as demo3:
 
     gr.Markdown(
         """
-        # Generate Audio File From Subtitle [
+        # Generate Audio File From Subtitle [Upload Only .srt file]
 
         To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
 
@@ -495,7 +620,12 @@ with gr.Blocks() as demo3:
             )
             with gr.Row():
                 generate_btn_ = gr.Button('Generate', variant='primary')
-
+
+            with gr.Accordion('Audio Settings', open=False):
+                custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')
+
+
+
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):
@@ -509,24 +639,292 @@ with gr.Blocks() as demo3:
     # )
     generate_btn_.click(
         srt_process,
-        inputs=[srt_file,voice],
+        inputs=[srt_file,voice,custom_voicepack],
         outputs=[audio]
     )
 
+
+
+#### Voice mixing
+# modified from here
+# https://huggingface.co/spaces/ysharma/Make_Custom_Voices_With_KokoroTTS
+def get_voices():
+    voices = {}
+    for i in os.listdir("./KOKORO/voices"):
+        if i.endswith(".pt"):
+            voice_name = i.replace(".pt", "")
+            voices[voice_name] = torch.load(f"./KOKORO/voices/{i}", weights_only=True).to(device)
+
+    slider_configs = {}
+
+    # Iterate through the predefined list of voices
+    for i in voices:
+        # Handle the default case for "af"
+        if i == "af":
+            slider_configs["af"]= "Default 👩🇺🇸"
+            continue
+        if i == "af_nicole":
+            slider_configs["af_nicole"]="Nicole 😏🇺🇸"
+            continue
+        if i == "af_bella":
+            slider_configs["af_bella"]="Bella 🤗🇺🇸"
+            continue
+
+        # Determine the country emoji
+        country = "🇺🇸" if i.startswith("a") else "🇬🇧"
+
+        # Determine the gender emoji and name
+        if "f_" in i:
+            display_name = f"{i.split('_')[-1].capitalize()} 👩{country}"
+        elif "m_" in i:
+            display_name = f"{i.split('_')[-1].capitalize()} 👨{country}"
+        else:
+            display_name = f"{i.capitalize()} 😐"
+
+        # Append the voice tuple to the list
+        slider_configs[i]= display_name
+
+    return voices, slider_configs
+
+voices, slider_configs = get_voices()
+
+
+def parse_voice_formula(formula):
+    global voices
+    """Parse the voice formula string and return the combined voice tensor."""
+    if not formula.strip():
+        raise ValueError("Empty voice formula")
+
+    # Initialize the weighted sum
+    weighted_sum = None
+
+    # Split the formula into terms
+    terms = formula.split('+')
+    weights=0
+    for term in terms:
+        # Parse each term (format: "voice_name * 0.333")
+        parts = term.strip().split('*')
+        if len(parts) != 2:
+            raise ValueError(f"Invalid term format: {term.strip()}. Should be 'voice_name * weight'")
+
+        voice_name = parts[0].strip()
+        weight = float(parts[1].strip())
+        weights+=weight
+        # print(voice_name)
+        # print(weight)
+        # Get the voice tensor
+        if voice_name not in voices:
+            raise ValueError(f"Unknown voice: {voice_name}")
+
+        voice_tensor = voices[voice_name]
+
+        # Add to weighted sum
+        if weighted_sum is None:
+            weighted_sum = weight * voice_tensor
+        else:
+            weighted_sum += weight * voice_tensor
+    return weighted_sum/weights
+
+
+
+
+
+
+def get_new_voice(formula):
+    # print(formula)
+    try:
+        # Parse the formula and get the combined voice tensor
+        weighted_voices = parse_voice_formula(formula)
+        voice_pack_name = "./weighted_normalised_voices.pt"
+        # Save and load the combined voice
+        torch.save(weighted_voices, voice_pack_name)
+        # print(f"Voice pack saved at: {voice_pack_name}")
+        return voice_pack_name
+    except Exception as e:
+        raise gr.Error(f"Failed to create voice: {str(e)}")
+
+
+def generate_voice_formula(*values):
+    """
+    Generate a formatted string showing the normalized voice combination.
+    Returns: String like "0.6 * voice1" or "0.4 * voice1 + 0.6 * voice2"
+    """
+    n = len(values) // 2
+    checkbox_values = values[:n]
+    slider_values = list(values[n:])
+    global slider_configs
+    # Get active sliders and their names
+    active_pairs = [(slider_values[i], slider_configs[i][0])
+                    for i in range(len(slider_configs))
+                    if checkbox_values[i]]
+
+    if not active_pairs:
+        return ""
+
+    # If only one voice is selected, use its actual value
+    if len(active_pairs) == 1:
+        value, name = active_pairs[0]
+        return f"{value:.3f} * {name}"
+
+    # Calculate sum for normalization of multiple voices
+    total_sum = sum(value for value, _ in active_pairs)
+
+    if total_sum == 0:
+        return ""
+
+    # Generate normalized formula for multiple voices
+    terms = []
+    for value, name in active_pairs:
+        normalized_value = value / total_sum
+        terms.append(f"{normalized_value:.3f} * {name}")
+
+    return " + ".join(terms)
 
+
+
+
+
+def create_voice_mix_ui():
+    with gr.Blocks() as demo:
+        gr.Markdown(
+            """
+            # Kokoro Voice Mixer
+            Select voices and adjust their weights to create a mixed voice.
+            """
+        )
+
+        voice_components = {}
+        voice_names = list(voices.keys())
+        female_voices = [name for name in voice_names if "f_" in name]
+        male_voices = [name for name in voice_names if "b_" in name]
+        neutral_voices = [name for name in voice_names if "f_" not in name and "b_" not in name]
+
+        # Define how many columns you want
+        num_columns = 3
+
+        # Function to generate UI
+        def generate_ui_row(voice_list):
+            num_voices = len(voice_list)
+            num_rows = (num_voices + num_columns - 1) // num_columns
+            for i in range(num_rows):
+                with gr.Row():
+                    for j in range(num_columns):
+                        index = i * num_columns + j
+                        if index < num_voices:
+                            voice_name = voice_list[index]
+                            with gr.Column():
+                                checkbox = gr.Checkbox(label=slider_configs[voice_name])
+                                weight_slider = gr.Slider(
+                                    minimum=0,
+                                    maximum=1,
+                                    value=1.0,
+                                    step=0.01,
+                                    interactive=False
+                                )
+                                voice_components[voice_name] = (checkbox, weight_slider)
+                                checkbox.change(
+                                    lambda x, slider=weight_slider: gr.update(interactive=x),
+                                    inputs=[checkbox],
+                                    outputs=[weight_slider]
+                                )
+
+        generate_ui_row(female_voices)
+        generate_ui_row(male_voices)
+        generate_ui_row(neutral_voices)
+
+        formula_inputs = []
+        for i in voice_components:
+            checkbox, slider = voice_components[i]
+            formula_inputs.append(checkbox)
+            formula_inputs.append(slider)
+
+        with gr.Row():
+            voice_formula = gr.Textbox(label="Voice Formula", interactive=False)
+
+        # Function to dynamically update the voice formula
+        def update_voice_formula(*args):
+            formula_parts = []
+            for i, (checkbox, slider) in enumerate(voice_components.values()):
+                if args[i * 2]:  # If checkbox is selected
+                    formula_parts.append(f"{list(voice_components.keys())[i]} * {args[i * 2 + 1]:.3f}")
+            return " + ".join(formula_parts)
+
+
+        # Update formula whenever any checkbox or slider changes
+        for checkbox, slider in voice_components.values():
+            checkbox.change(
+                update_voice_formula,
+                inputs=formula_inputs,
+                outputs=[voice_formula]
+            )
+            slider.change(
+                update_voice_formula,
+                inputs=formula_inputs,
+                outputs=[voice_formula]
+            )
+
+        with gr.Row():
+            voice_text = gr.Textbox(
+                label='Enter Text',
+                lines=3,
+                placeholder="Type your text here to preview the custom voice..."
+            )
+            voice_generator = gr.Button('Generate', variant='primary')
+        with gr.Accordion('Audio Settings', open=False):
+            model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
+            speed = gr.Slider(
+                minimum=0.25, maximum=2, value=1, step=0.1,
+                label='⚡️Speed', info='Adjust the speaking speed'
+            )
+            remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+        with gr.Row():
+            voice_audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+        with gr.Row():
+            mix_voice_download = gr.File(label="Download VoicePack")
+        with gr.Accordion('Enable Autoplay', open=False):
+            autoplay = gr.Checkbox(value=True, label='Autoplay')
+            autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[voice_audio])
+        def generate_custom_audio(text_input, formula_text, model_name, speed, remove_silence):
+            try:
+                new_voice_pack = get_new_voice(formula_text)
+                audio_output_path =text_to_speech(text=text_input, model_name=model_name, voice_name="af", speed=speed, pad_between_segments=0, remove_silence=remove_silence, minimum_silence=0.05,custom_voicepack=new_voice_pack,trim=0.0)
+                # audio_output_path = text_to_speech(text=text_input, model_name=model_name,voice_name="af", speed=1.0, custom_voicepack=new_voice_pack)
+                return audio_output_path,new_voice_pack
+            except Exception as e:
+                raise gr.Error(f"Failed to generate audio: {e}")
+
+
+        voice_generator.click(
+            generate_custom_audio,
+            inputs=[voice_text, voice_formula,model_name,speed,remove_silence],
+            outputs=[voice_audio,mix_voice_download]
+        )
+    return demo
+
+demo4 = create_voice_mix_ui()
+
+
+
+
 display_text = " \n".join(voice_list)
 
-with gr.Blocks() as demo4:
-    gr.Markdown("Run on Your Local System [Kokoro-82M-WebUI](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")
+with gr.Blocks() as demo5:
     gr.Markdown(f"# Voice Names \n{display_text}")
+
+
 
 
-
-
-
-
-def main(debug
-demo = gr.TabbedInterface([demo1, demo2,demo3,demo4], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Available Voice Names"],title="Kokoro TTS"
+import click
+@click.command()
+@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
+def main(debug, share):
+    demo = gr.TabbedInterface([demo1, demo2,demo3,demo4,demo5], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Voice Mix","Available Voice Names"],title="Kokoro TTS")
 
     demo.queue().launch(debug=debug, share=share)
 #Run on local network
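Usage sketch for the voice-mixing additions above: a formula is a `+`-joined list of `name * weight` terms; `parse_voice_formula()` returns the weight-normalised sum of the voice tensors, and `get_new_voice()` saves it as a loadable `.pt` pack. A hypothetical direct call, outside the Gradio UI:

```python
# Hypothetical direct use of the functions added in this hunk.
formula = "af_bella * 0.6 + af_nicole * 0.4"
pack = get_new_voice(formula)  # writes ./weighted_normalised_voices.pt
audio_path = text_to_speech(
    text="Hello from a mixed voice.",
    model_name="kokoro-v0_19.pth",
    voice_name="af",              # overridden by the custom pack below
    custom_voicepack=pack,
)
```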
@@ -559,4 +957,4 @@ if __name__ == "__main__":
 
     # save_at=f"./temp_audio/{os.path.basename(result)}"
     # shutil.move(result, save_at)
-    # print(f"Saved at {save_at}")
+    # print(f"Saved at {save_at}")
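With the new `click` entry point, the debug and share flags now come from the command line. A sketch of the equivalent invocation:

```python
# Equivalent of running `python app.py --debug --share` from a shell.
import subprocess

subprocess.run(["python", "app.py", "--debug", "--share"], check=True)
```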