cocktailpeanut committed · Commit e90071b · Parent(s): 04df9c3 · update

Files changed:
- app_locally.py +18 -145
- requirements_locally.txt +17 -17
app_locally.py
CHANGED
@@ -2,101 +2,25 @@ import os
 import torch
 import argparse
 import gradio as gr
-from zipfile import ZipFile
-import langid
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--online_checkpoint_url", default="https://myshell-public-repo-hosting.s3.amazonaws.com/checkpoints_1226.zip")
-parser.add_argument("--share", action='store_true', default=False, help="make link public")
-args = parser.parse_args()
-
-# first download the checkpoints from server
-if not os.path.exists('checkpoints/'):
-    print('Downloading OpenVoice checkpoint ...')
-    os.system(f'wget {args.online_checkpoint_url} -O ckpt.zip')
-    print('Extracting OpenVoice checkpoint ...')
-    ZipFile("ckpt.zip").extractall()
+#from zipfile import ZipFile
+from melo.api import TTS
 
 # Init EN/ZH baseTTS and ToneConvertor
 from OpenVoice import se_extractor
 from OpenVoice.api import BaseSpeakerTTS, ToneColorConverter
+import devicetorch
 
-en_ckpt_base = 'checkpoints/base_speakers/EN'
-zh_ckpt_base = 'checkpoints/base_speakers/ZH'
-ckpt_converter = 'checkpoints/converter'
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-output_dir = 'outputs'
-os.makedirs(output_dir, exist_ok=True)
-en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
-en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
-zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
-zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
-tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
-tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
-en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
-en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
-zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
+device = devicetorch.get(torch)
 
-
-def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, agree):
+def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, language):
     # initialize a empty info
     text_hint = ''
-    # agree with the terms
-    if agree == False:
-        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
-        gr.Warning("Please accept the Terms & Condition!")
-        return (
-            text_hint,
-            None,
-            None,
-        )
 
-    # detect the input language
-    language_predicted = langid.classify(prompt)[0].strip()
-    print(f"Detected language:{language_predicted}")
-
-    if language_predicted not in supported_languages:
-        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
-        gr.Warning(
-            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
-        )
 
-        return (
-            text_hint,
-            None,
-            None,
-        )
-
-    if language_predicted == "zh":
-        tts_model = zh_base_speaker_tts
-        source_se = zh_source_se
-        language = 'Chinese'
-        if style not in ['default']:
-            text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
-            gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
-            return (
-                text_hint,
-                None,
-                None,
-            )
-
-    else:
-        tts_model = en_base_speaker_tts
-        if style == 'default':
-            source_se = en_source_default_se
-        else:
-            source_se = en_source_style_se
-        language = 'English'
-        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
-            text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
-            gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
-            return (
-                text_hint,
-                None,
-                None,
-            )
+    tts_model = TTS(language=language, device=device)
+    speaker_id = models[language].hps.data.spk2id
+    speaker_key = speaker_key.lower().replace('_', '-')
+    source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth', map_location=device)
+
 
     if use_mic == True:
        if mic_file_path is not None:
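Two notes on the new setup path above. devicetorch is a small helper for picking an accelerator; a rough plain-PyTorch equivalent of devicetorch.get(torch) (an assumption about its behavior, not the package's actual source) would be:

    def pick_device(torch_module):
        # Prefer CUDA, then Apple MPS, and fall back to CPU.
        if torch_module.cuda.is_available():
            return 'cuda'
        if getattr(torch_module.backends, 'mps', None) is not None and torch_module.backends.mps.is_available():
            return 'mps'
        return 'cpu'

Also, as committed, the new body of predict references an undefined name models and reads speaker_key before assigning it, so the function would raise a NameError when called. The intended logic is presumably closer to this sketch (the speaker choice is a guess; spk2id is the speaker table MeloTTS models expose):

    tts_model = TTS(language=language, device=device)
    speaker_ids = tts_model.hps.data.spk2id            # e.g. {'EN-US': 0, ...}
    speaker_key = list(speaker_ids)[0]                 # assumed: take the first speaker
    speaker_id = speaker_ids[speaker_key]
    speaker_key = speaker_key.lower().replace('_', '-')
    source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth', map_location=device)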
@@ -123,16 +47,6 @@ def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, agree):
                 None,
                 None,
             )
-    # if len(prompt) > 200:
-    #     text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
-    #     gr.Warning(
-    #         "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
-    #     )
-    #     return (
-    #         text_hint,
-    #         None,
-    #         None,
-    #     )
 
     # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
     try:
@@ -150,6 +64,9 @@ def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, agree):
         )
 
         src_path = f'{output_dir}/tmp.wav'
+
+        speed = 1.0
+
         tts_model.tts(prompt, src_path, speaker=style, language=language)
 
         save_path = f'{output_dir}/output.wav'
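speed is introduced here, but the synthesis call below it is left on the old BaseSpeakerTTS.tts signature, which MeloTTS's TTS class does not appear to provide. If synthesis is meant to go through MeloTTS, its documented entry point is tts_to_file; a hedged sketch of that substitution (speaker choice is an assumption, as in the earlier note):

    speaker_ids = tts_model.hps.data.spk2id
    first_speaker = list(speaker_ids)[0]               # assumed speaker choice
    tts_model.tts_to_file(prompt, speaker_ids[first_speaker], src_path, speed=speed)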
@@ -171,46 +88,6 @@ def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, agree):
     )
 
 
-
-title = "MyShell OpenVoice"
-
-description = """
-We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
-"""
-
-markdown_table = """
-<div align="center" style="margin-bottom: 10px;">
-
-| | | |
-| :-----------: | :-----------: | :-----------: |
-| **OpenSource Repo** | **Project Page** | **Join the Community** |
-| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
-
-</div>
-"""
-
-markdown_table_v2 = """
-<div align="center" style="margin-bottom: 2px;">
-
-| | | | |
-| :-----------: | :-----------: | :-----------: | :-----------: |
-| **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
-
-| | |
-| :-----------: | :-----------: |
-**Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
-
-</div>
-"""
-content = """
-<div>
-<strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
-This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
-</div>
-"""
-wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
-
-
 examples = [
     [
         "今天天气真好,我们一起出去吃饭吧。",
@@ -239,8 +116,8 @@ examples = [
 
 with gr.Blocks(analytics_enabled=False) as demo:
 
-    with gr.Row():
-        gr.HTML(wrapped_markdown_content)
+    # with gr.Row():
+    #     gr.HTML(wrapped_markdown_content)
 
     with gr.Row():
         with gr.Column():
@@ -273,11 +150,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
                 value=False,
                 info="Notice: Microphone input may not work properly under traffic",
             )
-
-            tos_gr = gr.Checkbox(
-                label="Agree",
-                value=False,
-                info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
-            )
+            language = gr.Radio(['EN_NEWEST', 'EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN_NEWEST')
 
             tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
 
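The Radio choices appear to mirror MeloTTS language identifiers, and the selected value is passed straight into predict as language. A standalone way to sanity-check which speakers each choice exposes (assuming MeloTTS is installed; models download on first use):

    from melo.api import TTS
    for lang in ['EN_NEWEST', 'EN', 'ES', 'FR', 'ZH', 'JP', 'KR']:
        model = TTS(language=lang, device='cpu')
        print(lang, list(model.hps.data.spk2id))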
@@ -289,11 +162,11 @@
 
     gr.Examples(examples,
                 label="Examples",
-                inputs=[input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, tos_gr],
+                inputs=[input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language],
                 outputs=[out_text_gr, audio_gr, ref_audio_gr],
                 fn=predict,
                 cache_examples=False,)
-    tts_button.click(predict, [input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
+    tts_button.click(predict, [input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language], outputs=[out_text_gr, audio_gr, ref_audio_gr])
 
 demo.queue()
-demo.launch(debug=True, show_api=True, share=args.share)
+demo.launch(debug=True, show_api=True)
requirements_locally.txt
CHANGED
@@ -1,18 +1,18 @@
-langid
-librosa==0.9.1
-faster-whisper
-pydub==0.25.1
-#wavmark==0.0.2
-wavmark
-#numpy==1.22.0
-numpy
-eng_to_ipa==0.0.2
-inflect==7.0.0
-unidecode==1.3.7
-whisper-timestamped
-openai
-python-dotenv
-pypinyin==0.50.0
-cn2an==0.5.22
-jieba==0.42.1
+#langid
+#librosa==0.9.1
+#faster-whisper
+#pydub==0.25.1
+##wavmark==0.0.2
+#wavmark
+##numpy==1.22.0
+#numpy
+#eng_to_ipa==0.0.2
+#inflect==7.0.0
+#unidecode==1.3.7
+#whisper-timestamped
+#openai
+#python-dotenv
+#pypinyin==0.50.0
+#cn2an==0.5.22
+#jieba==0.42.1
 gradio==3.48.0
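With everything except gradio commented out, the imports added in app_locally.py (melo.api, devicetorch, and the OpenVoice package itself) must be installed outside this file. If you wanted to pin them here instead, the entries would presumably look like this (assumed sources, not part of this commit):

# devicetorch
# git+https://github.com/myshell-ai/MeloTTS.git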