# B2BMGMT_ov2 / app_locally.py
# Hosting-page header captured with this file (not part of the program):
#   cocktailpeanut's picture
#   update
#   87ed940
import argparse
import os

import torch
import gradio as gr
#from zipfile import ZipFile
from melo.api import TTS
# Init EN/ZH baseTTS and ToneConvertor
import openvoice
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
import devicetorch
# Startup diagnostic: list the names the installed openvoice package exposes.
# This needs the module-level `import openvoice`; the `from openvoice import ...`
# forms alone do not bind the name `openvoice`, so the print used to raise
# NameError before the app could start.
print(f"openvoice = {dir(openvoice)}")

# Device used for both the TTS model and the tone colour converter.
device = devicetorch.get(torch)

# Load the tone colour converter (config + weights) from the local checkpoint
# directory once at import time so every request reuses it.
ckpt_converter = 'checkpoints/converter'
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, language):
    """Synthesise ``prompt`` and convert it to the reference speaker's voice.

    Args:
        prompt: Text to speak; must be at least two characters long.
        style: Value of the "Style" dropdown. Used as the base-speaker name
            when the loaded TTS model has a speaker with exactly that name;
            otherwise the model's first speaker is used.
        audio_file_pth: Path of the uploaded reference audio.
        mic_file_path: Path of the microphone recording, or None.
        use_mic: When true, the reference voice comes from ``mic_file_path``.
        language: Language code handed to ``TTS``.

    Returns:
        A ``(text_hint, output_audio_path, reference_audio_path)`` tuple. On
        a handled failure both paths are None and ``text_hint`` holds the
        error text.
    """

    def fail(hint, warning):
        # Show the problem in the UI and blank both audio outputs.
        gr.Warning(warning)
        return hint, None, None

    # Validate the cheap inputs first so a bad request never pays for model
    # loading or audio processing.
    if use_mic:
        if mic_file_path is None:
            return fail(
                "[ERROR] Please record your voice with Microphone, or uncheck Use Microphone to use reference audios\n",
                "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios",
            )
        speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth

    if not prompt or len(prompt) < 2:
        return fail(
            "[ERROR] Please give a longer prompt text \n",
            "Please give a longer prompt text",
        )

    # Extract the target tone colour embedding from the reference audio.
    try:
        target_se, _wavs_folder = se_extractor.get_se(
            speaker_wav,
            tone_color_converter,
            target_dir='processed',
            max_length=60.,
            vad=True,
        )
    except Exception as e:
        message = f"[ERROR] Get target tone color error {str(e)} \n"
        return fail(message, message)

    tts_model = TTS(language=language, device=device)

    # The speaker table belongs to the model that was just loaded; copy it
    # into a plain dict so the lookups below do not depend on its container type.
    speaker_ids = dict(tts_model.hps.data.spk2id.items())
    if not speaker_ids:
        message = f"[ERROR] No base speaker available for language {language} \n"
        return fail(message, message)

    # NOTE(review): the Style dropdown values look like leftovers from an older
    # model and may never match a speaker name; the fallback is the first speaker.
    if isinstance(style, str) and style in speaker_ids:
        speaker_name = style
    else:
        speaker_name = next(iter(speaker_ids))
    speaker_id = speaker_ids[speaker_name]

    # Embedding files are named after the lower-cased, dash-separated speaker name.
    speaker_key = speaker_name.lower().replace('_', '-')
    source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth', map_location=device)

    # NOTE(review): no output directory is defined anywhere in this file;
    # 'outputs' is an assumed location - confirm it suits the deployment.
    output_dir = 'outputs'
    os.makedirs(output_dir, exist_ok=True)

    # Render the prompt with the base speaker into a temporary file.
    src_path = f'{output_dir}/tmp.wav'
    speed = 1.0
    tts_model.tts_to_file(prompt, speaker_id, src_path, speed=speed)

    # Run the tone color converter
    save_path = f'{output_dir}/output.wav'
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message)

    text_hint = 'Get response successfully \n'
    return (
        text_hint,
        save_path,
        speaker_wav,
    )
# Example rows for gr.Examples. Column order matches the `inputs` list of the
# examples widget and the parameters of predict():
#   [prompt, style, reference audio path, microphone path, use_mic, language]
# The last column feeds the Language radio, so it must be one of that radio's
# choices rather than a boolean.
examples = [
    [
        "今天天气真好,我们一起出去吃饭吧。",
        'default',
        "examples/speaker0.mp3",
        None,
        False,
        'ZH',
    ],
    [
        "This audio is generated by open voice with a half-performance model.",
        'whispering',
        "examples/speaker1.mp3",
        None,
        False,
        'EN_NEWEST',
    ],
    [
        "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
        'sad',
        "examples/speaker2.mp3",
        None,
        False,
        'EN_NEWEST',
    ],
]
# Gradio front end: the left column collects the inputs, the right column shows
# the results. The same component lists feed both the examples widget and the
# Send button, so they are declared once and copied per call.
# NOTE(review): the original indentation was lost; the nesting below (examples
# and click handler inside the second column, queue/launch outside the Blocks
# context) is reconstructed - confirm against the intended layout.
style_choices = ['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']
language_choices = ['EN_NEWEST', 'EN', 'ES', 'FR', 'ZH', 'JP', 'KR']

with gr.Blocks(analytics_enabled=False) as demo:
    # with gr.Row():
    #     gr.HTML(wrapped_markdown_content)
    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time is better. Up to 200 text characters.",
                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
            )
            style_gr = gr.Dropdown(
                label="Style",
                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
                choices=style_choices,
                max_choices=1,
                value="default",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/speaker0.mp3",
            )
            mic_gr = gr.Audio(
                source="microphone",
                type="filepath",
                info="Use your microphone to record audio",
                label="Use Microphone for Reference",
            )
            use_mic_gr = gr.Checkbox(
                label="Use Microphone",
                value=False,
                info="Notice: Microphone input may not work properly under traffic",
            )
            language = gr.Radio(language_choices, label='Language', value='EN_NEWEST')
            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

            # Order mirrors predict()'s parameters and its returned tuple.
            input_components = (input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language)
            output_components = (out_text_gr, audio_gr, ref_audio_gr)

            gr.Examples(
                examples,
                label="Examples",
                inputs=list(input_components),
                outputs=list(output_components),
                fn=predict,
                cache_examples=False,
            )
            tts_button.click(
                predict,
                list(input_components),
                outputs=list(output_components),
            )

# Enable request queuing, then start the server.
demo.queue()
demo.launch(debug=True, show_api=True)