cocktailpeanut committed on
Commit e90071b · 1 Parent(s): 04df9c3
Files changed (2):
  1. app_locally.py +18 -145
  2. requirements_locally.txt +17 -17
app_locally.py CHANGED
@@ -2,101 +2,25 @@ import os
  import torch
  import argparse
  import gradio as gr
- from zipfile import ZipFile
- import langid
-
-
- parser = argparse.ArgumentParser()
- parser.add_argument("--online_checkpoint_url", default="https://myshell-public-repo-hosting.s3.amazonaws.com/checkpoints_1226.zip")
- parser.add_argument("--share", action='store_true', default=False, help="make link public")
- args = parser.parse_args()
-
- # first download the checkpoints from server
- if not os.path.exists('checkpoints/'):
-     print('Downloading OpenVoice checkpoint ...')
-     os.system(f'wget {args.online_checkpoint_url} -O ckpt.zip')
-     print('Extracting OpenVoice checkpoint ...')
-     ZipFile("ckpt.zip").extractall()
+ #from zipfile import ZipFile
+ from melo.api import TTS

  # Init EN/ZH baseTTS and ToneConvertor
  from OpenVoice import se_extractor
  from OpenVoice.api import BaseSpeakerTTS, ToneColorConverter
+ import devicetorch

- en_ckpt_base = 'checkpoints/base_speakers/EN'
- zh_ckpt_base = 'checkpoints/base_speakers/ZH'
- ckpt_converter = 'checkpoints/converter'
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
- output_dir = 'outputs'
- os.makedirs(output_dir, exist_ok=True)
- en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
- en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
- zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
- zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
- tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
- tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
- en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
- en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
- zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
+ device = devicetorch.get(torch)

- supported_languages = ['zh', 'en']
-
- def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, agree):
+ def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, language):
      # initialize a empty info
      text_hint = ''
-     # agree with the terms
-     if agree == False:
-         text_hint += '[ERROR] Please accept the Terms & Condition!\n'
-         gr.Warning("Please accept the Terms & Condition!")
-         return (
-             text_hint,
-             None,
-             None,
-         )

-     # first detect the input language
-     language_predicted = langid.classify(prompt)[0].strip()
-     print(f"Detected language:{language_predicted}")
-
-     if language_predicted not in supported_languages:
-         text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
-         gr.Warning(
-             f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
-         )
-
-         return (
-             text_hint,
-             None,
-             None,
-         )
-
-     if language_predicted == "zh":
-         tts_model = zh_base_speaker_tts
-         source_se = zh_source_se
-         language = 'Chinese'
-         if style not in ['default']:
-             text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
-             gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
-             return (
-                 text_hint,
-                 None,
-                 None,
-             )
-
-     else:
-         tts_model = en_base_speaker_tts
-         if style == 'default':
-             source_se = en_source_default_se
-         else:
-             source_se = en_source_style_se
-         language = 'English'
-         if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
-             text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
-             gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
-             return (
-                 text_hint,
-                 None,
-                 None,
-             )
+     tts_model = TTS(language=language, device=device)
+     speaker_id = models[language].hps.data.spk2id
+     speaker_key = speaker_key.lower().replace('_', '-')
+     source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth', map_location=device)

      if use_mic == True:
          if mic_file_path is not None:
@@ -123,16 +47,6 @@ def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, agree):
              None,
              None,
          )
-     # if len(prompt) > 200:
-     #     text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
-     #     gr.Warning(
-     #         "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
-     #     )
-     #     return (
-     #         text_hint,
-     #         None,
-     #         None,
-     #     )

      # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
      try:
@@ -150,6 +64,9 @@ def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, agree):
          )

      src_path = f'{output_dir}/tmp.wav'
+
+     speed = 1.0
+
      tts_model.tts(prompt, src_path, speaker=style, language=language)

      save_path = f'{output_dir}/output.wav'
@@ -171,46 +88,6 @@ def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, agree):
          )


-
- title = "MyShell OpenVoice"
-
- description = """
- We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
- """
-
- markdown_table = """
- <div align="center" style="margin-bottom: 10px;">
-
- | | | |
- | :-----------: | :-----------: | :-----------: |
- | **OpenSource Repo** | **Project Page** | **Join the Community** |
- | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
-
- </div>
- """
-
- markdown_table_v2 = """
- <div align="center" style="margin-bottom: 2px;">
-
- | | | | |
- | :-----------: | :-----------: | :-----------: | :-----------: |
- | **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
-
- | | |
- | :-----------: | :-----------: |
- **Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
-
- </div>
- """
- content = """
- <div>
- <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
- This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
- </div>
- """
- wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
-
-
  examples = [
      [
          "今天天气真好,我们一起出去吃饭吧。",
@@ -239,8 +116,8 @@ examples = [

  with gr.Blocks(analytics_enabled=False) as demo:

-     with gr.Row():
-         gr.HTML(wrapped_markdown_content)
+     # with gr.Row():
+     #     gr.HTML(wrapped_markdown_content)

      with gr.Row():
          with gr.Column():
@@ -273,11 +150,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
                  value=False,
                  info="Notice: Microphone input may not work properly under traffic",
              )
-             tos_gr = gr.Checkbox(
-                 label="Agree",
-                 value=False,
-                 info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
-             )
+             language = gr.Radio(['EN_NEWEST', 'EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN_NEWEST')

          tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
283
 
@@ -289,11 +162,11 @@ with gr.Blocks(analytics_enabled=False) as demo:
289
 
290
  gr.Examples(examples,
291
  label="Examples",
292
- inputs=[input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, tos_gr],
293
  outputs=[out_text_gr, audio_gr, ref_audio_gr],
294
  fn=predict,
295
  cache_examples=False,)
296
- tts_button.click(predict, [input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
297
 
298
  demo.queue()
299
- demo.launch(debug=True, show_api=True, share=args.share)
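Note on the new predict() body: as committed it references models and speaker_key before either is defined, and output_dir (removed along with the old setup block) is never re-declared, so the function cannot run as-is. Below is a minimal sketch of what the block presumably intends, following the MeloTTS README API (TTS, hps.data.spk2id, tts_to_file); the synthesize wrapper and its defaults are our assumptions, not code from this commit.

import os
import torch
import devicetorch
from melo.api import TTS

device = devicetorch.get(torch)   # resolves to 'cuda', 'mps', or 'cpu'
output_dir = 'outputs'            # removed by this commit; re-declared here
os.makedirs(output_dir, exist_ok=True)

def synthesize(prompt, language='EN_NEWEST', speed=1.0):
    # Load the Melo base speaker model for the requested language.
    tts_model = TTS(language=language, device=device)
    # spk2id maps speaker names to ids, e.g. {'EN-Newest': 0}; take the first.
    speaker_key, speaker_id = next(iter(tts_model.hps.data.spk2id.items()))
    speaker_key = speaker_key.lower().replace('_', '-')
    # Per-speaker tone-color embedding shipped with the OpenVoice checkpoints.
    source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth',
                           map_location=device)
    src_path = f'{output_dir}/tmp.wav'
    tts_model.tts_to_file(prompt, speaker_id, src_path, speed=speed)
    return src_path, source_se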
 
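For reference, devicetorch.get(torch) replaces the old device = 'cuda' if torch.cuda.is_available() else 'cpu' line; the point of the swap is Apple-silicon (MPS) support. A rough sketch of the behavior we assume the package provides, not its actual source:

import torch

def get_device_string() -> str:
    # Assumed behavior of devicetorch.get(torch): return the first
    # available backend among CUDA, Apple MPS, and plain CPU.
    if torch.cuda.is_available():
        return 'cuda'
    if torch.backends.mps.is_available():
        return 'mps'
    return 'cpu'

print(get_device_string())  # e.g. 'mps' on Apple silicon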
requirements_locally.txt CHANGED
@@ -1,18 +1,18 @@
- langid
- librosa==0.9.1
- faster-whisper
- pydub==0.25.1
- #wavmark==0.0.2
- wavmark
- #numpy==1.22.0
- numpy
- eng_to_ipa==0.0.2
- inflect==7.0.0
- unidecode==1.3.7
- whisper-timestamped
- openai
- python-dotenv
- pypinyin==0.50.0
- cn2an==0.5.22
- jieba==0.42.1
+ #langid
+ #librosa==0.9.1
+ #faster-whisper
+ #pydub==0.25.1
+ ##wavmark==0.0.2
+ #wavmark
+ ##numpy==1.22.0
+ #numpy
+ #eng_to_ipa==0.0.2
+ #inflect==7.0.0
+ #unidecode==1.3.7
+ #whisper-timestamped
+ #openai
+ #python-dotenv
+ #pypinyin==0.50.0
+ #cn2an==0.5.22
+ #jieba==0.42.1
  gradio==3.48.0
 
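Every OpenVoice-specific pin is now commented out, so requirements_locally.txt installs only gradio==3.48.0 and the app relies on MeloTTS and devicetorch (installed separately) to pull in torch, librosa, and the rest. A quick import check; the module list is our reading of what app_locally.py still needs, not something this commit ships:

import importlib.util

# Modules app_locally.py still imports after this commit; melo and
# devicetorch do not come from requirements_locally.txt.
for mod in ('torch', 'gradio', 'melo.api', 'devicetorch', 'OpenVoice.api'):
    try:
        found = importlib.util.find_spec(mod) is not None
    except ModuleNotFoundError:
        found = False
    print(f"{mod}: {'ok' if found else 'MISSING'}")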