NeuralFalcon committed on
Commit
3a5f7f4
·
verified ·
1 Parent(s): 6b16783

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +436 -38
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from KOKORO.models import build_model
2
  from KOKORO.utils import tts,tts_file_name,podcast
3
  import sys
@@ -6,11 +7,25 @@ import os
6
  os.system("python download_model.py")
7
  import torch
8
  import gc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  print("Loading model...")
10
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
11
  print(f'Using device: {device}')
12
- # MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
13
- MODEL = build_model('./KOKORO/fp16/kokoro-v0_19-half.pth', device)
14
  print("Model loaded successfully.")
15
 
16
  def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
@@ -22,7 +37,7 @@ def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_p
22
 
23
 
24
  model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
25
- current_model = model_list[-1]
26
 
27
  def update_model(model_name):
28
  """
@@ -43,8 +58,21 @@ def update_model(model_name):
43
  return f"Model updated to {model_name}"
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- def text_to_speech(text, model_name="kokoro-v0_19-half.pth", voice_name="af", speed=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20,trim=0.5):
 
 
48
  """
49
  Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
50
  """
@@ -54,6 +82,12 @@ def text_to_speech(text, model_name="kokoro-v0_19-half.pth", voice_name="af", sp
54
  minimum_silence = 0.05
55
  keep_silence = int(minimum_silence * 1000)
56
  save_at = tts_file_name(text)
 
 
 
 
 
 
57
  audio_path = tts_maker(
58
  text,
59
  voice_name,
@@ -96,7 +130,6 @@ def toggle_autoplay(autoplay):
96
 
97
  with gr.Blocks() as demo1:
98
  gr.Markdown("# Batched TTS")
99
- gr.Markdown("Run on Your Local System [Kokoro-82M-WebUI](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")
100
  with gr.Row():
101
  with gr.Column():
102
  text = gr.Textbox(
@@ -115,16 +148,17 @@ with gr.Blocks() as demo1:
115
  with gr.Row():
116
  generate_btn = gr.Button('Generate', variant='primary')
117
  with gr.Accordion('Audio Settings', open=False):
118
- model_name=gr.Dropdown(model_list,label="Model",value=model_list[-1])
 
 
 
 
119
  remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
120
  minimum_silence = gr.Number(
121
  label="Keep Silence Upto (In seconds)",
122
  value=0.05
123
  )
124
- speed = gr.Slider(
125
- minimum=0.25, maximum=2, value=1, step=0.1,
126
- label='⚡️Speed', info='Adjust the speaking speed'
127
- )
128
  # trim = gr.Slider(
129
  # minimum=0, maximum=1, value=0, step=0.1,
130
  # label='🔪 Trim', info='How much to cut from both ends of each segment'
@@ -134,6 +168,8 @@ with gr.Blocks() as demo1:
134
  label='🔇 Pad Between', info='Silent Duration between segments [For Large Text]'
135
  )
136
 
 
 
137
  with gr.Column():
138
  audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
139
  with gr.Accordion('Enable Autoplay', open=False):
@@ -142,16 +178,16 @@ with gr.Blocks() as demo1:
142
 
143
  text.submit(
144
  text_to_speech,
145
- inputs=[text, model_name,voice, speed, pad_between, remove_silence, minimum_silence],
146
  outputs=[audio]
147
  )
148
  generate_btn.click(
149
  text_to_speech,
150
- inputs=[text,model_name, voice, speed, pad_between, remove_silence, minimum_silence],
151
  outputs=[audio]
152
  )
153
 
154
- def podcast_maker(text,remove_silence=False,minimum_silence=50,model_name="kokoro-v0_19-half.pth"):
155
  global MODEL,device
156
  update_model(model_name)
157
  if not minimum_silence:
@@ -258,13 +294,13 @@ def your_tts(text,audio_path,actual_duration,speed=1.0):
258
  global srt_voice_name
259
  model_name="kokoro-v0_19.pth"
260
  tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed,trim=1.0)
261
- print(tts_path)
262
  tts_audio = AudioSegment.from_file(tts_path)
263
  tts_duration = len(tts_audio)
264
  if tts_duration > actual_duration:
265
  speedup_factor = tts_duration / actual_duration
266
  tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor,trim=1.0)
267
- print(tts_path)
268
  shutil.copy(tts_path,audio_path)
269
 
270
 
@@ -321,6 +357,77 @@ def clean_srt(input_path):
321
 
322
 
323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
 
326
  class SRTDubbing:
@@ -343,14 +450,15 @@ class SRTDubbing:
343
  if tts_duration > actual_duration:
344
  speedup_factor = tts_duration / actual_duration
345
  speedup_filename = "./cache/speedup_temp.wav"
 
346
  # Use ffmpeg to change audio speed
347
- subprocess.run([
348
- "ffmpeg",
349
- "-i", tts_filename,
350
- "-filter:a", f"atempo={speedup_factor}",
351
- speedup_filename,
352
- "-y"
353
- ], check=True)
354
 
355
  # Replace the original TTS audio with the sped-up version
356
  shutil.move(speedup_filename, audio_path)
@@ -456,10 +564,27 @@ class SRTDubbing:
456
  with open("entries.json", "w") as file:
457
  json.dump(entries, file, indent=4)
458
  return entries
459
- srt_voice_name="am_adam"
460
- def srt_process(srt_file_path,voice_name,dest_language="en"):
461
- global srt_voice_name
462
- srt_voice_name=voice_name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  srt_dubbing = SRTDubbing()
464
  dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
465
  srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
@@ -476,7 +601,7 @@ with gr.Blocks() as demo3:
476
 
477
  gr.Markdown(
478
  """
479
- # Generate Audio File From Subtitle [Single Speaker Only]
480
 
481
  To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
482
 
@@ -495,7 +620,12 @@ with gr.Blocks() as demo3:
495
  )
496
  with gr.Row():
497
  generate_btn_ = gr.Button('Generate', variant='primary')
498
-
 
 
 
 
 
499
  with gr.Column():
500
  audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
501
  with gr.Accordion('Enable Autoplay', open=False):
@@ -509,24 +639,292 @@ with gr.Blocks() as demo3:
509
  # )
510
  generate_btn_.click(
511
  srt_process,
512
- inputs=[srt_file,voice],
513
  outputs=[audio]
514
  )
515
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  display_text = " \n".join(voice_list)
518
 
519
- with gr.Blocks() as demo4:
520
- gr.Markdown("Run on Your Local System [Kokoro-82M-WebUI](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")
521
  gr.Markdown(f"# Voice Names \n{display_text}")
 
 
522
 
523
 
524
- # import click
525
- # @click.command()
526
- # @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
527
- # @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
528
- def main(debug=False, share=False):
529
- demo = gr.TabbedInterface([demo1, demo2,demo3,demo4], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Available Voice Names"],title="Kokoro TTS",theme='JohnSmith9982/small_and_pretty')
530
 
531
  demo.queue().launch(debug=debug, share=share)
532
  #Run on local network
@@ -559,4 +957,4 @@ if __name__ == "__main__":
559
 
560
  # save_at=f"./temp_audio/{os.path.basename(result)}"
561
  # shutil.move(result, save_at)
562
- # print(f"Saved at {save_at}")
 
1
+
2
  from KOKORO.models import build_model
3
  from KOKORO.utils import tts,tts_file_name,podcast
4
  import sys
 
7
  os.system("python download_model.py")
8
  import torch
9
  import gc
10
+ import platform
11
+ import shutil
12
+ base_path=os.getcwd()
13
def clean_folder_before_start(root=None, folders=("dummy", "TTS_DUB", "kokoro_audio")):
    """Reset the app's scratch folders so each launch starts with empty ones.

    Every folder is removed together with its contents (when present) and
    then re-created.

    Args:
        root: Directory that contains the scratch folders. Defaults to the
            module-level ``base_path``.
        folders: Names of the folders to reset.
    """
    if root is None:
        root = base_path
    for folder in folders:
        target = os.path.join(root, folder)
        try:
            shutil.rmtree(target)
        except FileNotFoundError:
            pass  # nothing to clean on a first run
        except OSError as err:
            # Best effort: stale files are not fatal, but say so instead of
            # hiding the failure (a bare ``except`` also hid Ctrl+C).
            print(f"Could not clean {target}: {err}")
        os.makedirs(target, exist_ok=True)
23
+ clean_folder_before_start()
24
+
25
  print("Loading model...")
26
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
27
  print(f'Using device: {device}')
28
+ MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
 
29
  print("Model loaded successfully.")
30
 
31
  def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
 
37
 
38
 
39
  model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
40
+ current_model = model_list[0]
41
 
42
  def update_model(model_name):
43
  """
 
58
  return f"Model updated to {model_name}"
59
 
60
 
61
def manage_files(file_path, max_size=5 * 1024 * 1024):
    """Validate an uploaded voice pack and delete it when it is rejected.

    A file is accepted when its extension is ``.pt`` (compared
    case-insensitively) and its size does not exceed ``max_size`` bytes.

    Args:
        file_path: Path of the uploaded file.
        max_size: Largest accepted size in bytes (5 MB by default).

    Returns:
        True when the file exists and was kept; False when it is missing,
        is not a regular file, or was rejected (and removed).
    """
    # ``isfile`` rather than ``exists``: a directory must never reach
    # ``os.remove``, which would raise instead of returning False.
    if not os.path.isfile(file_path):
        return False

    extension = os.path.splitext(file_path)[1].lower()
    if extension == ".pt" and os.path.getsize(file_path) <= max_size:
        return True

    try:
        os.remove(file_path)  # drop the invalid or oversized upload
    except OSError as err:
        # The file is rejected either way; failing to delete it must not
        # abort the request.
        print(f"Could not delete rejected file {file_path}: {err}")
    return False
72
 
73
+
74
+
75
+ def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20,custom_voicepack=None,trim=0.0):
76
  """
77
  Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
78
  """
 
82
  minimum_silence = 0.05
83
  keep_silence = int(minimum_silence * 1000)
84
  save_at = tts_file_name(text)
85
+ # print(voice_name,custom_voicepack)
86
+ if custom_voicepack:
87
+ if manage_files(custom_voicepack):
88
+ voice_name = custom_voicepack
89
+ else:
90
+ gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
91
  audio_path = tts_maker(
92
  text,
93
  voice_name,
 
130
 
131
  with gr.Blocks() as demo1:
132
  gr.Markdown("# Batched TTS")
 
133
  with gr.Row():
134
  with gr.Column():
135
  text = gr.Textbox(
 
148
  with gr.Row():
149
  generate_btn = gr.Button('Generate', variant='primary')
150
  with gr.Accordion('Audio Settings', open=False):
151
+ model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
152
+ speed = gr.Slider(
153
+ minimum=0.25, maximum=2, value=1, step=0.1,
154
+ label='⚡️Speed', info='Adjust the speaking speed'
155
+ )
156
  remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
157
  minimum_silence = gr.Number(
158
  label="Keep Silence Upto (In seconds)",
159
  value=0.05
160
  )
161
+
 
 
 
162
  # trim = gr.Slider(
163
  # minimum=0, maximum=1, value=0, step=0.1,
164
  # label='🔪 Trim', info='How much to cut from both ends of each segment'
 
168
  label='🔇 Pad Between', info='Silent Duration between segments [For Large Text]'
169
  )
170
 
171
+ custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')
172
+
173
  with gr.Column():
174
  audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
175
  with gr.Accordion('Enable Autoplay', open=False):
 
178
 
179
  text.submit(
180
  text_to_speech,
181
+ inputs=[text, model_name,voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
182
  outputs=[audio]
183
  )
184
  generate_btn.click(
185
  text_to_speech,
186
+ inputs=[text,model_name, voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
187
  outputs=[audio]
188
  )
189
 
190
+ def podcast_maker(text,remove_silence=False,minimum_silence=50,model_name="kokoro-v0_19.pth"):
191
  global MODEL,device
192
  update_model(model_name)
193
  if not minimum_silence:
 
294
  global srt_voice_name
295
  model_name="kokoro-v0_19.pth"
296
  tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed,trim=1.0)
297
+ # print(tts_path)
298
  tts_audio = AudioSegment.from_file(tts_path)
299
  tts_duration = len(tts_audio)
300
  if tts_duration > actual_duration:
301
  speedup_factor = tts_duration / actual_duration
302
  tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor,trim=1.0)
303
+ # print(tts_path)
304
  shutil.copy(tts_path,audio_path)
305
 
306
 
 
357
 
358
 
359
 
360
+ import librosa
361
+ import soundfile as sf
362
+ import subprocess
363
+
364
def speedup_audio_librosa(input_file, output_file, speedup_factor):
    """Time-stretch ``input_file`` by ``speedup_factor`` and save the result.

    The audio is stretched with librosa (tempo changes, pitch is kept) and
    written to ``output_file`` at the sample rate it was loaded with. If any
    step fails, a Gradio warning is shown and the input is copied to
    ``output_file`` unchanged so callers still find a file there.
    """
    try:
        samples, sample_rate = librosa.load(input_file, sr=None)
        sf.write(
            output_file,
            librosa.effects.time_stretch(samples, rate=speedup_factor),
            sample_rate,
        )
    except Exception as e:
        gr.Warning(f"Error during speedup with Librosa: {e}")
        shutil.copy(input_file, output_file)
379
+
380
+
381
+
382
+
383
def is_ffmpeg_installed():
    """Check whether a runnable FFmpeg binary is available.

    On Windows the bundled ``./ffmpeg/ffmpeg.exe`` is tried first and the
    ``ffmpeg`` found on PATH second; on other systems only ``ffmpeg`` is
    tried. A candidate counts as installed when ``<candidate> -version``
    exits successfully.

    Returns:
        tuple: ``(True, path)`` with the first working candidate, or
        ``(False, path)`` with the preferred candidate when none works (a
        Gradio warning is emitted in that case).
    """
    if platform.system() == "Windows":
        candidates = [os.path.join("./ffmpeg", "ffmpeg.exe"), "ffmpeg"]
    else:
        candidates = ["ffmpeg"]

    for candidate in candidates:
        try:
            subprocess.run(
                [candidate, "-version"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )
        except (OSError, subprocess.CalledProcessError):
            # OSError covers a missing binary as well as one that exists but
            # cannot be executed (e.g. PermissionError).
            continue
        return True, candidate

    gr.Warning("FFmpeg is not installed. Using 'librosa' for speedup audio in SRT dubbing",duration= 20)
    return False, candidates[0]
396
+
397
+
398
+
399
+
400
+ # ffmpeg -i test.wav -filter:a "atempo=2.0" ffmpeg.wav -y
401
def change_speed(input_file, output_file, speedup_factor):
    """Write a tempo-adjusted copy of ``input_file`` to ``output_file``.

    When the module-level ``use_ffmpeg`` flag is set, FFmpeg's ``atempo``
    filter is run through ``local_ffmpeg_path``. If FFmpeg is disabled or the
    command fails, ``speedup_audio_librosa`` is used instead.

    Args:
        input_file: Path of the audio file to speed up.
        output_file: Path the result is written to (overwritten if present).
        speedup_factor: Tempo multiplier passed to ``atempo`` / librosa.
    """
    if not use_ffmpeg:
        speedup_audio_librosa(input_file, output_file, speedup_factor)
        return

    command = [
        local_ffmpeg_path,
        "-y",  # global option: overwrite output; placed first, not trailing
        "-i", input_file,
        "-filter:a", f"atempo={speedup_factor}",
        output_file,
    ]
    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception as e:
        # gr.Error is an exception class and shows nothing unless raised;
        # since we recover via librosa, a warning is the right notification.
        gr.Warning(f"Error during speedup with FFmpeg: {e}")
        speedup_audio_librosa(input_file, output_file, speedup_factor)
426
+
427
+
428
+
429
+
430
+
431
 
432
 
433
  class SRTDubbing:
 
450
  if tts_duration > actual_duration:
451
  speedup_factor = tts_duration / actual_duration
452
  speedup_filename = "./cache/speedup_temp.wav"
453
+ change_speed(tts_filename, speedup_filename, speedup_factor)
454
  # Use ffmpeg to change audio speed
455
+ # subprocess.run([
456
+ # "ffmpeg",
457
+ # "-i", tts_filename,
458
+ # "-filter:a", f"atempo={speedup_factor}",
459
+ # speedup_filename,
460
+ # "-y"
461
+ # ], check=True)
462
 
463
  # Replace the original TTS audio with the sped-up version
464
  shutil.move(speedup_filename, audio_path)
 
564
  with open("entries.json", "w") as file:
565
  json.dump(entries, file, indent=4)
566
  return entries
567
+ srt_voice_name="af"
568
+ use_ffmpeg,local_ffmpeg_path = is_ffmpeg_installed()
569
+ # use_ffmpeg=False
570
+
571
+ def srt_process(srt_file_path,voice_name,custom_voicepack=None,dest_language="en"):
572
+ global srt_voice_name,use_ffmpeg
573
+
574
+ if not srt_file_path.endswith(".srt"):
575
+ gr.Error("Please upload a valid .srt file",duration=5)
576
+ return None
577
+ if use_ffmpeg:
578
+ gr.Success("Using FFmpeg for audio speedup to sync with subtitle")
579
+ else:
580
+ gr.Warning("Install FFmpeg to ensure high-quality audio when speeding up the audio to sync with subtitle. Default Using 'librosa' for speedup",duration= 20)
581
+
582
+ if custom_voicepack:
583
+ if manage_files(custom_voicepack):
584
+ srt_voice_name = custom_voicepack
585
+ else:
586
+ srt_voice_name=voice_name
587
+ gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
588
  srt_dubbing = SRTDubbing()
589
  dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
590
  srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
 
601
 
602
  gr.Markdown(
603
  """
604
+ # Generate Audio File From Subtitle [Upload Only .srt file]
605
 
606
  To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
607
 
 
620
  )
621
  with gr.Row():
622
  generate_btn_ = gr.Button('Generate', variant='primary')
623
+
624
+ with gr.Accordion('Audio Settings', open=False):
625
+ custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')
626
+
627
+
628
+
629
  with gr.Column():
630
  audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
631
  with gr.Accordion('Enable Autoplay', open=False):
 
639
  # )
640
  generate_btn_.click(
641
  srt_process,
642
+ inputs=[srt_file,voice,custom_voicepack],
643
  outputs=[audio]
644
  )
645
 
646
+
647
+
648
+ #### Voice mixing
649
+ # modified from here
650
+ # https://huggingface.co/spaces/ysharma/Make_Custom_Voices_With_KokoroTTS
651
# Voices whose label does not follow the generic "<Name> <gender><flag>" rule.
_SPECIAL_VOICE_LABELS = {
    "af": "Default 👩🇺🇸",
    "af_nicole": "Nicole 😏🇺🇸",
    "af_bella": "Bella 🤗🇺🇸",
}


def _voice_display_name(voice_name):
    """Return the UI label (name plus emoji) for a voice identifier."""
    if voice_name in _SPECIAL_VOICE_LABELS:
        return _SPECIAL_VOICE_LABELS[voice_name]

    # Names starting with "a" get the US flag, everything else the GB flag.
    country = "🇺🇸" if voice_name.startswith("a") else "🇬🇧"
    short_name = voice_name.split("_")[-1].capitalize()
    if "f_" in voice_name:
        return f"{short_name} 👩{country}"
    if "m_" in voice_name:
        return f"{short_name} 👨{country}"
    return f"{voice_name.capitalize()} 😐"


def get_voices(voices_dir="./KOKORO/voices", target_device=None):
    """Load every ``.pt`` voice pack in a folder and build its display label.

    Args:
        voices_dir: Folder scanned for ``*.pt`` files.
        target_device: Device the loaded tensors are moved to. Defaults to
            the module-level ``device``.

    Returns:
        tuple: ``(voices, slider_configs)``. ``voices`` maps each voice name
        (file name without the ``.pt`` suffix) to its loaded tensor;
        ``slider_configs`` maps the same names to their UI label. Both are
        ordered alphabetically by file name.
    """
    if target_device is None:
        target_device = device

    voices = {}
    # os.listdir returns entries in arbitrary order; sort so the UI built
    # from this mapping is laid out the same way on every machine.
    for file_name in sorted(os.listdir(voices_dir)):
        # splitext strips only the trailing suffix, unlike str.replace.
        stem, extension = os.path.splitext(file_name)
        if extension != ".pt":
            continue
        voice_path = os.path.join(voices_dir, file_name)
        voices[stem] = torch.load(voice_path, weights_only=True).to(target_device)

    slider_configs = {name: _voice_display_name(name) for name in voices}
    return voices, slider_configs
688
+
689
+ voices, slider_configs = get_voices()
690
+
691
+
692
def parse_voice_formula(formula, voice_bank=None):
    """Parse a voice formula and return the weight-normalised blend.

    The formula is a ``+``-separated list of ``voice_name * weight`` terms,
    for example ``"af * 0.5 + bf_emma * 1"``. The result is
    ``sum(weight_i * voice_i) / sum(weight_i)``.

    Args:
        formula: Formula string to parse.
        voice_bank: Mapping of voice name to voice tensor. Defaults to the
            module-level ``voices``.

    Returns:
        The blended voice (same type as the entries of ``voice_bank``).

    Raises:
        ValueError: If the formula is empty, a term is not of the form
            ``voice_name * weight``, a weight is not a number, a voice is
            unknown, or the weights sum to zero.
    """
    if voice_bank is None:
        voice_bank = voices
    if not formula.strip():
        raise ValueError("Empty voice formula")

    weighted_sum = None
    total_weight = 0
    for term in formula.split('+'):
        # Each term has the format "voice_name * 0.333".
        parts = term.strip().split('*')
        if len(parts) != 2:
            raise ValueError(f"Invalid term format: {term.strip()}. Should be 'voice_name * weight'")

        voice_name = parts[0].strip()
        weight = float(parts[1].strip())
        if voice_name not in voice_bank:
            raise ValueError(f"Unknown voice: {voice_name}")

        total_weight += weight
        contribution = weight * voice_bank[voice_name]
        # Build a new object each time so entries of voice_bank are never
        # modified in place.
        weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution

    # Dividing a tensor by zero yields inf/NaN silently instead of failing,
    # which would be saved as a corrupt voice pack.
    if total_weight == 0:
        raise ValueError("Voice weights must not sum to zero")
    return weighted_sum / total_weight
727
+
728
+
729
+
730
+
731
+
732
+
733
+
734
def get_new_voice(formula):
    """Build a mixed voice pack from ``formula`` and save it to disk.

    Args:
        formula: Voice formula understood by ``parse_voice_formula``.

    Returns:
        Path of the saved voice pack (``./weighted_normalised_voices.pt``).

    Raises:
        gr.Error: If the formula cannot be parsed or the file cannot be
            saved; the original exception is attached as the cause.
    """
    # NOTE(review): the path is fixed, so every call overwrites the same
    # file — concurrent requests would share it.
    voice_pack_name = "./weighted_normalised_voices.pt"
    try:
        weighted_voices = parse_voice_formula(formula)
        torch.save(weighted_voices, voice_pack_name)
    except Exception as e:
        # Chain the cause so the real traceback stays visible in the logs.
        raise gr.Error(f"Failed to create voice: {str(e)}") from e
    return voice_pack_name
746
+
747
+
748
def generate_voice_formula(*values, voice_names=None):
    """Build a normalised voice formula from checkbox and slider values.

    Args:
        *values: First half are the checkbox states, second half the slider
            weights, both in the order of ``voice_names``.
        voice_names: Voice identifiers matching the value order. Defaults to
            the keys of the module-level ``slider_configs``.

    Returns:
        A string in the ``voice_name * weight`` format accepted by
        ``parse_voice_formula``, e.g. ``"af * 0.600"`` or
        ``"af * 0.400 + bf_emma * 0.600"``. A single selected voice keeps
        its raw weight; several voices are normalised to sum to 1. Returns
        ``""`` when nothing is selected or all selected weights are zero.
    """
    if voice_names is None:
        # slider_configs is a dict keyed by voice name; indexing it with
        # integers (as before) raised KeyError for every selected voice.
        voice_names = list(slider_configs)

    n = len(values) // 2
    checkbox_values = values[:n]
    slider_values = values[n:]

    active_pairs = [
        (weight, name)
        for checked, weight, name in zip(checkbox_values, slider_values, voice_names)
        if checked
    ]
    if not active_pairs:
        return ""

    # If only one voice is selected, use its actual value.
    if len(active_pairs) == 1:
        value, name = active_pairs[0]
        return f"{name} * {value:.3f}"

    total_sum = sum(value for value, _ in active_pairs)
    if total_sum == 0:
        return ""

    return " + ".join(
        f"{name} * {value / total_sum:.3f}" for value, name in active_pairs
    )
783
 
784
+
785
+
786
+
787
+
788
def create_voice_mix_ui():
    """Build the "Voice Mix" tab and return it as a ``gr.Blocks`` instance.

    For every entry of the module-level ``voices`` the tab shows a checkbox
    and a weight slider (female voices first, then male, then the rest,
    three per row). The selection is mirrored into a read-only formula text
    box; "Generate" turns that formula into a voice pack with
    ``get_new_voice`` and previews it through ``text_to_speech``.

    Returns:
        gr.Blocks: The assembled UI.
    """
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # Kokoro Voice Mixer
            Select voices and adjust their weights to create a mixed voice.
            """
        )

        voice_components = {}
        voice_names = list(voices.keys())
        # Same substring tests as get_voices(): "f_" marks female and "m_"
        # male voices. (The previous "b_" test matched no male voice.)
        female_voices = [name for name in voice_names if "f_" in name]
        male_voices = [name for name in voice_names if "m_" in name]
        neutral_voices = [name for name in voice_names if "f_" not in name and "m_" not in name]

        # Number of voices laid out side by side.
        num_columns = 3

        def generate_ui_row(voice_list):
            """Render ``voice_list`` as rows of checkbox + slider pairs."""
            for row_start in range(0, len(voice_list), num_columns):
                with gr.Row():
                    for voice_name in voice_list[row_start:row_start + num_columns]:
                        with gr.Column():
                            checkbox = gr.Checkbox(label=slider_configs[voice_name])
                            weight_slider = gr.Slider(
                                minimum=0,
                                maximum=1,
                                value=1.0,
                                step=0.01,
                                interactive=False
                            )
                        voice_components[voice_name] = (checkbox, weight_slider)
                        # The slider is editable only while its voice is ticked.
                        checkbox.change(
                            lambda x, slider=weight_slider: gr.update(interactive=x),
                            inputs=[checkbox],
                            outputs=[weight_slider]
                        )

        generate_ui_row(female_voices)
        generate_ui_row(male_voices)
        generate_ui_row(neutral_voices)

        # Flat, interleaved input list: checkbox, slider, checkbox, slider, ...
        formula_inputs = []
        for checkbox, slider in voice_components.values():
            formula_inputs.append(checkbox)
            formula_inputs.append(slider)

        with gr.Row():
            voice_formula = gr.Textbox(label="Voice Formula", interactive=False)

        def update_voice_formula(*args):
            """Rebuild the formula from the interleaved checkbox/slider values."""
            return " + ".join(
                f"{name} * {args[position * 2 + 1]:.3f}"
                for position, name in enumerate(voice_components)
                if args[position * 2]  # checkbox is selected
            )

        # Update the formula whenever any checkbox or slider changes.
        for checkbox, slider in voice_components.values():
            checkbox.change(
                update_voice_formula,
                inputs=formula_inputs,
                outputs=[voice_formula]
            )
            slider.change(
                update_voice_formula,
                inputs=formula_inputs,
                outputs=[voice_formula]
            )

        with gr.Row():
            voice_text = gr.Textbox(
                label='Enter Text',
                lines=3,
                placeholder="Type your text here to preview the custom voice..."
            )
            voice_generator = gr.Button('Generate', variant='primary')
        with gr.Accordion('Audio Settings', open=False):
            model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
            speed = gr.Slider(
                minimum=0.25, maximum=2, value=1, step=0.1,
                label='⚡️Speed', info='Adjust the speaking speed'
            )
            remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
        with gr.Row():
            voice_audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
        with gr.Row():
            mix_voice_download = gr.File(label="Download VoicePack")
        with gr.Accordion('Enable Autoplay', open=False):
            autoplay = gr.Checkbox(value=True, label='Autoplay')
            autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[voice_audio])

        def generate_custom_audio(text_input, formula_text, model_name, speed, remove_silence):
            """Create the mixed voice pack and synthesise ``text_input`` with it.

            Returns the audio path and the voice pack path (for download).
            """
            try:
                new_voice_pack = get_new_voice(formula_text)
                audio_output_path =text_to_speech(text=text_input, model_name=model_name, voice_name="af", speed=speed, pad_between_segments=0, remove_silence=remove_silence, minimum_silence=0.05,custom_voicepack=new_voice_pack,trim=0.0)
                return audio_output_path,new_voice_pack
            except Exception as e:
                raise gr.Error(f"Failed to generate audio: {e}") from e

        voice_generator.click(
            generate_custom_audio,
            inputs=[voice_text, voice_formula,model_name,speed,remove_silence],
            outputs=[voice_audio,mix_voice_download]
        )
    return demo
904
+
905
+ demo4 = create_voice_mix_ui()
906
+
907
+
908
+
909
+
910
+
911
+
912
+
913
+
914
  display_text = " \n".join(voice_list)
915
 
916
+ with gr.Blocks() as demo5:
 
917
  gr.Markdown(f"# Voice Names \n{display_text}")
918
+
919
+
920
 
921
 
922
+ import click
923
+ @click.command()
924
+ @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
925
+ @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
926
+ def main(debug, share):
927
+ demo = gr.TabbedInterface([demo1, demo2,demo3,demo4,demo5], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Voice Mix","Available Voice Names"],title="Kokoro TTS")
928
 
929
  demo.queue().launch(debug=debug, share=share)
930
  #Run on local network
 
957
 
958
  # save_at=f"./temp_audio/{os.path.basename(result)}"
959
  # shutil.move(result, save_at)
960
+ # print(f"Saved at {save_at}")