Dupaja committed on
Commit
c6292d7
·
1 Parent(s): 3f96c9f

Add ljspeech endpoint

Browse files
Files changed (1) hide show
  1. app.py +43 -12
app.py CHANGED
@@ -1,17 +1,10 @@
1
  import gradio as gr
2
- import styletts2importable
 
3
  import numpy as np
 
4
  import re
5
 
6
- theme = gr.themes.Base(
7
- font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
8
- )
9
- voicelist = ['f-us-1', 'f-us-2', 'f-us-3', 'f-us-4', 'm-us-1', 'm-us-2', 'm-us-3', 'm-us-4']
10
- voices = {}
11
- import phonemizer
12
- global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
13
- for v in voicelist:
14
- voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
15
  def split_and_recombine_text(text, desired_length=200, max_length=400):
16
  """Split text it into chunks of a desired length trying to keep sentences intact."""
17
  # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
@@ -83,6 +76,10 @@ def split_and_recombine_text(text, desired_length=200, max_length=400):
83
 
84
  return rv
85
 
 
 
 
 
86
  def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
87
  if text.strip() == "":
88
  raise gr.Error("You must enter some text")
@@ -95,15 +92,49 @@ def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
95
  return (24000, np.concatenate(audios))
96
 
97
 
98
- with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
99
  with gr.Row():
100
  with gr.Column(scale=1):
101
  inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
102
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
103
  multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
 
104
  with gr.Column(scale=1):
105
  btn = gr.Button("Synthesize", variant="primary")
106
  audio = gr.Audio(interactive=False, label="Synthesized Audio")
107
  btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
108
 
109
- demo.queue(api_open=True, max_size=15).launch(show_api=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import ljspeechimportable
3
+ import torch
4
  import numpy as np
5
+ import styletts2importable
6
  import re
7
 
 
 
 
 
 
 
 
 
 
8
  def split_and_recombine_text(text, desired_length=200, max_length=400):
9
  """Split text it into chunks of a desired length trying to keep sentences intact."""
10
  # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
 
76
 
77
  return rv
78
 
79
+ theme = gr.themes.Base(
80
+ font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
81
+ )
82
+
83
  def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
84
  if text.strip() == "":
85
  raise gr.Error("You must enter some text")
 
92
  return (24000, np.concatenate(audios))
93
 
94
 
95
+ def ljsynthesize(text, steps, progress=gr.Progress()):
96
+ noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
97
+ if text.strip() == "":
98
+ raise gr.Error("You must enter some text")
99
+ texts = split_and_recombine_text(text)
100
+ audios = []
101
+ for t in progress.tqdm(texts):
102
+ audios.append(ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1))
103
+ return (24000, np.concatenate(audios))
104
+
105
+ with gr.Blocks() as libritts: # just realized it isn't vctk but libritts but i'm too lazy to change it rn
106
  with gr.Row():
107
  with gr.Column(scale=1):
108
  inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
109
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
110
  multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
111
+ # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
112
  with gr.Column(scale=1):
113
  btn = gr.Button("Synthesize", variant="primary")
114
  audio = gr.Audio(interactive=False, label="Synthesized Audio")
115
  btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
116
 
117
+ with gr.Blocks() as lj:
118
+ with gr.Row():
119
+ with gr.Column(scale=1):
120
+ ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
121
+ ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
122
+ with gr.Column(scale=1):
123
+ ljbtn = gr.Button("Synthesize", variant="primary")
124
+ ljaudio = gr.Audio(interactive=False, label="Synthesized Audio")
125
+ ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps], outputs=[ljaudio], concurrency_limit=4)
126
+
127
+ with gr.Blocks(title="StyleTTS 2", css="", theme=theme) as demo:
128
+ gr.DuplicateButton("Duplicate Space")
129
+ gr.TabbedInterface([libritts, lj], ['Multi-Voice', 'LJSpeech'])
130
+ gr.Markdown("""
131
+ Original Demo by [mrfakename](https://twitter.com/realmrfakename). I am not affiliated with the StyleTTS 2 authors.
132
+ Run this demo locally using Docker:
133
+ ```bash
134
+ docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all registry.hf.space/styletts2-styletts2:latest python app.py
135
+ ```
136
+ """) # Please do not remove this line.
137
+
138
+ if __name__ == "__main__":
139
+ # demo.queue(api_open=False, max_size=15).launch(show_api=False)
140
+ demo.queue(api_open=True, max_size=15).launch(show_api=True)