hexgrad committed on
Commit
c70c0b7
·
verified ·
1 Parent(s): 9b9ab2a

Upload 2 files

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. app.py +13 -8
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Kokoro TTS
3
- emoji: 🔊
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.5.0
8
  app_file: app.py
9
  pinned: true
10
  license: mit
 
1
  ---
2
+ title: Kokoro TTS v0.19
3
+ emoji: 🔊♥️🔊
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.6.0
8
  app_file: app.py
9
  pinned: true
10
  license: mit
app.py CHANGED
@@ -108,16 +108,17 @@ VOCAB = get_vocab()
108
  def tokenize(ps):
109
  return [i for i in map(VOCAB.get, ps) if i is not None]
110
 
111
- # ⭐ Starred voices are averages of similar voices. 🧪 Experimental voices may be unstable.
112
  CHOICES = {
113
  '🇺🇸 🚺 American Female ⭐': 'af',
114
  '🇺🇸 🚺 Bella': 'af_bella',
 
115
  '🇺🇸 🚺 Sarah': 'af_sarah',
116
  '🇺🇸 🚺 Sky 🧪': 'af_sky',
117
  '🇺🇸 🚹 Adam 🧪': 'am_adam',
118
  '🇺🇸 🚹 Michael': 'am_michael',
119
- '🇬🇧 🚹 Lewis': 'bm_lewis',
120
- '🇯🇵 🚺 Japanese Female 🧪': 'jf_0',
121
  }
122
  VOICES = {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()}
123
 
@@ -159,7 +160,7 @@ def forward(tokens, voice, speed):
159
  asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
160
  return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
161
 
162
- def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):
163
  if voice not in VOICES:
164
  # Ensure stability for https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
165
  voice = 'af'
@@ -204,7 +205,7 @@ with gr.Blocks() as basic_tts:
204
  with gr.Row():
205
  with gr.Column():
206
  text = gr.Textbox(label='Input Text')
207
- voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are averages of similar voices. 🧪 Experimental voices may be unstable.')
208
  with gr.Row():
209
  random_btn = gr.Button('Random Text', variant='secondary')
210
  generate_btn = gr.Button('Generate', variant='primary')
@@ -236,9 +237,9 @@ with gr.Blocks() as basic_tts:
236
  ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
237
  with gr.Row():
238
  with gr.Column():
239
- pad_before = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
240
  with gr.Column():
241
- pad_after = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
242
  autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
243
  text.submit(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
244
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
@@ -388,7 +389,7 @@ with gr.Blocks() as lf_tts:
388
  file_input = gr.File(file_types=['.pdf', '.txt'], label='Input File: pdf or txt')
389
  text = gr.Textbox(label='Input Text')
390
  file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
391
- voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are averages of similar voices. 🧪 Experimental voices may be unstable.')
392
  with gr.Accordion('Text Settings', open=False):
393
  skip_square_brackets = gr.Checkbox(True, label='Skip [Square Brackets]', info='Recommended for academic papers, Wikipedia articles, or texts with citations.')
394
  newline_split = gr.Number(2, label='Newline Split', info='Split the input text on this many newlines. Affects how the text is segmented.', precision=0, minimum=0)
@@ -434,6 +435,10 @@ The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](http
434
  Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
435
  The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
436
 
 
 
 
 
437
  ### Licenses
438
  Inference code: MIT<br/>
439
  espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>
 
108
  def tokenize(ps):
109
  return [i for i in map(VOCAB.get, ps) if i is not None]
110
 
111
+ # ⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.
112
  CHOICES = {
113
  '🇺🇸 🚺 American Female ⭐': 'af',
114
  '🇺🇸 🚺 Bella': 'af_bella',
115
+ '🇺🇸 🚺 Nicole': 'af_nicole',
116
  '🇺🇸 🚺 Sarah': 'af_sarah',
117
  '🇺🇸 🚺 Sky 🧪': 'af_sky',
118
  '🇺🇸 🚹 Adam 🧪': 'am_adam',
119
  '🇺🇸 🚹 Michael': 'am_michael',
120
+ '🇬🇧 🚹 Lewis 🧪': 'bm_lewis',
121
+ '🇯🇵 🚺 Japanese Female': 'jf_0',
122
  }
123
  VOICES = {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()}
124
 
 
160
  asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
161
  return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
162
 
163
+ def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=0, pad_after=0):
164
  if voice not in VOICES:
165
  # Ensure stability for https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
166
  voice = 'af'
 
205
  with gr.Row():
206
  with gr.Column():
207
  text = gr.Textbox(label='Input Text')
208
+ voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.')
209
  with gr.Row():
210
  random_btn = gr.Button('Random Text', variant='secondary')
211
  generate_btn = gr.Button('Generate', variant='primary')
 
237
  ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
238
  with gr.Row():
239
  with gr.Column():
240
+ pad_before = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
241
  with gr.Column():
242
+ pad_after = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
243
  autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
244
  text.submit(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
245
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
 
389
  file_input = gr.File(file_types=['.pdf', '.txt'], label='Input File: pdf or txt')
390
  text = gr.Textbox(label='Input Text')
391
  file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
392
+ voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.')
393
  with gr.Accordion('Text Settings', open=False):
394
  skip_square_brackets = gr.Checkbox(True, label='Skip [Square Brackets]', info='Recommended for academic papers, Wikipedia articles, or texts with citations.')
395
  newline_split = gr.Number(2, label='Newline Split', info='Split the input text on this many newlines. Affects how the text is segmented.', precision=0, minimum=0)
 
435
  Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
436
  The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
437
 
438
+ ### Voice Stability
439
+ ⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.<br/>
440
+ Unstable voices may be more likely to stumble or produce unnatural artifacts, especially on shorter texts.
441
+
442
  ### Licenses
443
  Inference code: MIT<br/>
444
  espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>