openvoice2 / openvoice_app.py
poemsforaphrodite's picture
Upload openvoice_app.py with huggingface_hub
02cd175 verified
raw
history blame
5.44 kB
import os
import torch
import argparse
import gradio as gr
import langid
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
from dotenv import load_dotenv
# Command-line options. `--share` is a plain on/off flag; `store_true`
# already defaults to False when the flag is absent.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--share",
    action="store_true",
    help="make link public",
)
args = parser.parse_args()

# Load variables from a .env file (python-dotenv) into the environment.
load_dotenv()

# Checkpoint directories for the base speakers and the tone color converter.
en_ckpt_base = 'base_speakers/EN'
zh_ckpt_base = 'base_speakers/ZH'
ckpt_converter = 'converter'

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Directory that receives the generated wav files; created on first run.
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)
# Load models: one base-speaker TTS per language plus the tone color converter.
# Each is built from its config.json and then populated from checkpoint.pth.
en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Load speaker embeddings.
# map_location remaps storages onto `device` while deserializing, so files
# that were saved from a CUDA session still load on a CPU-only machine
# (without it torch.load raises before `.to(device)` is ever reached).
en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth', map_location=device).to(device)
en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth', map_location=device).to(device)
zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth', map_location=device).to(device)

# Supported languages, as the codes returned by langid.classify.
supported_languages = ['zh', 'en']
# Styles accepted for English prompts. Kept as a list so that formatting it
# into the error message renders exactly the same text as before.
_EN_STYLES = ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']


def predict(prompt, style, audio_file_pth):
    """Speak `prompt` in `style` and convert it to the reference speaker's tone color.

    Args:
        prompt: Text to synthesise. Must be 2 to 200 characters long and be
            classified by langid as one of `supported_languages`.
        style: Speaking style name. Chinese accepts only 'default'; English
            accepts the entries of `_EN_STYLES`.
        audio_file_pth: File path of the reference audio whose tone color is
            extracted. May be None/empty when nothing was uploaded.

    Returns:
        A tuple ``(text_hint, save_path, audio_file_pth)``. On any validation
        or extraction failure the tuple is ``(error_message, None, None)``.
    """
    text_hint = ''

    # Detect the input language
    language_predicted = langid.classify(prompt)[0].strip()
    print(f"Detected language: {language_predicted}")

    if language_predicted not in supported_languages:
        text_hint += f"[ERROR] The detected language {language_predicted} is not supported. Supported languages: {supported_languages}\n"
        return text_hint, None, None

    # Pick the base speaker model and its source embedding for the language.
    if language_predicted == "zh":
        tts_model = zh_base_speaker_tts
        source_se = zh_source_se
        language = 'Chinese'
        if style != 'default':
            text_hint += f"[ERROR] The style {style} is not supported for Chinese. Supported style: 'default'\n"
            return text_hint, None, None
    else:
        tts_model = en_base_speaker_tts
        # Non-default English styles share a dedicated style embedding.
        source_se = en_source_default_se if style == 'default' else en_source_style_se
        language = 'English'
        if style not in _EN_STYLES:
            text_hint += f"[ERROR] The style {style} is not supported for English. Supported styles: {_EN_STYLES}\n"
            return text_hint, None, None

    if len(prompt) < 2:
        text_hint += "[ERROR] Please provide a longer prompt text.\n"
        return text_hint, None, None
    if len(prompt) > 200:
        text_hint += "[ERROR] Text length limited to 200 characters. Please try shorter text.\n"
        return text_hint, None, None

    # Without a reference file the extractor can only fail with an opaque
    # exception message; report the actual problem instead.
    if not audio_file_pth:
        text_hint += "[ERROR] Please upload a reference audio file.\n"
        return text_hint, None, None

    # Broad catch is deliberate: this is the UI boundary, and any extractor
    # failure should come back as a message rather than crash the request.
    try:
        target_se, audio_name = se_extractor.get_se(audio_file_pth, tone_color_converter, target_dir='processed', vad=True)
    except Exception as e:
        text_hint += f"[ERROR] Error extracting tone color: {str(e)}\n"
        return text_hint, None, None

    # Step 1: base-speaker synthesis, using src_path as the intermediate file.
    src_path = f'{output_dir}/tmp.wav'
    tts_model.tts(prompt, src_path, speaker=style, language=language)

    # Step 2: convert the intermediate audio from the source embedding to the
    # reference speaker's embedding.
    save_path = f'{output_dir}/output.wav'
    # NOTE(review): passed as `message` to convert(); presumably embedded as a
    # watermark - confirm against ToneColorConverter.convert.
    encode_message = "@MyShell"
    tone_color_converter.convert(audio_src_path=src_path, src_se=source_se, tgt_se=target_se, output_path=save_path, message=encode_message)

    text_hint += "Response generated successfully.\n"
    return text_hint, save_path, audio_file_pth
title = "MyShell OpenVoice"

# Hide Gradio footer. The stylesheet must exist before the app is built: it
# only takes effect when handed to gr.Blocks(css=...). Assigning it after
# launch(), as before, left the footer visible.
css = "footer {visibility: hidden}"

# Gradio interface setup: inputs in the left column, results in the right.
# `title` sets the browser tab title.
with gr.Blocks(theme=gr.themes.Glass(), title=title, css=css) as demo:
    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time is better. Up to 200 text characters.",
                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
            )
            style_gr = gr.Dropdown(
                label="Style",
                choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
                info="Please upload a reference audio file that is atleast 1 minute long. For best results, ensure the audio is clear. You can use Adobe Podcast Enhance(https://podcast.adobe.com/enhance) to improve the audio quality before uploading.",
                max_choices=1,
                value="default",
            )
            # type="filepath" hands predict() a path string for the audio.
            ref_gr = gr.Audio(
                label="Reference Audio",
                type="filepath",
                value="resources/demo_speaker2.mp3",
            )
            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

    # predict() returns (info text, synthesised wav path, reference path),
    # mapped onto the three output components in that order.
    tts_button.click(predict, [input_text_gr, style_gr, ref_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])

demo.queue()
demo.launch(debug=True, show_api=False, share=args.share)