# OpenVoice voice-cloning Gradio demo (Hugging Face Space entry point).
import os
import torch
import argparse
import gradio as gr
from zipfile import ZipFile
import langid
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
parser = argparse.ArgumentParser()
parser.add_argument("--share", action='store_true', default=False, help="make link public")
args = parser.parse_args()

# Checkpoint directories for the two base TTS speakers and the tone-color converter.
en_ckpt_base = 'base_speakers/EN'
zh_ckpt_base = 'base_speakers/ZH'
ckpt_converter = 'converter'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# Load models
en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Load speaker embeddings.
# map_location ensures embeddings saved on a GPU machine still load on a
# CPU-only host (plain torch.load would raise a CUDA deserialization error).
en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth', map_location=device).to(device)
en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth', map_location=device).to(device)
zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth', map_location=device).to(device)

# Languages predict() accepts (langid codes).
supported_languages = ['zh', 'en']
def predict(prompt, style, audio_file_pth):
    """Synthesize `prompt` with a base speaker, then clone the reference voice's tone color.

    Args:
        prompt: Text to synthesize (2-200 characters; language auto-detected).
        style: Speaking style. Chinese supports only 'default'; English supports
            the full list validated below.
        audio_file_pth: Path to the reference audio whose tone color is cloned.

    Returns:
        Tuple of (info_text, output_wav_path_or_None, reference_audio_path_or_None);
        the audio paths are None on any validation or extraction error.
    """
    text_hint = ''

    # Detect the input language; langid.classify returns (lang_code, score).
    language_predicted = langid.classify(prompt)[0].strip()
    print(f"Detected language: {language_predicted}")
    if language_predicted not in supported_languages:
        text_hint += f"[ERROR] The detected language {language_predicted} is not supported. Supported languages: {supported_languages}\n"
        gr.Warning(f"The detected language {language_predicted} is not supported. Supported languages: {supported_languages}")
        return text_hint, None, None

    if language_predicted == "zh":
        tts_model = zh_base_speaker_tts
        source_se = zh_source_se
        language = 'Chinese'
        if style != 'default':
            text_hint += f"[ERROR] The style {style} is not supported for Chinese. Supported style: 'default'\n"
            gr.Warning(f"The style {style} is not supported for Chinese. Supported style: 'default'")
            return text_hint, None, None
    else:
        # Single source of truth for the English styles (previously duplicated
        # verbatim in three places). The f-string repr of this list reproduces
        # the original error messages exactly.
        en_styles = ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']
        # Validate the style before selecting any model/embedding.
        if style not in en_styles:
            text_hint += f"[ERROR] The style {style} is not supported for English. Supported styles: {en_styles}\n"
            gr.Warning(f"The style {style} is not supported for English. Supported styles: {en_styles}")
            return text_hint, None, None
        tts_model = en_base_speaker_tts
        # 'default' has its own speaker embedding; all other styles share one.
        source_se = en_source_default_se if style == 'default' else en_source_style_se
        language = 'English'

    if len(prompt) < 2:
        text_hint += "[ERROR] Please provide a longer prompt text.\n"
        gr.Warning("Please provide a longer prompt text.")
        return text_hint, None, None
    if len(prompt) > 200:
        text_hint += "[ERROR] Text length limited to 200 characters. Please try shorter text.\n"
        gr.Warning("Text length limited to 200 characters. Please try shorter text.")
        return text_hint, None, None

    # Extract the target speaker embedding (tone color) from the reference audio.
    try:
        target_se, audio_name = se_extractor.get_se(audio_file_pth, tone_color_converter, target_dir='processed', vad=True)
    except Exception as e:
        text_hint += f"[ERROR] Error extracting tone color: {str(e)}\n"
        gr.Warning(f"[ERROR] Error extracting tone color: {str(e)}")
        return text_hint, None, None

    # Synthesize with the base speaker, then convert to the target tone color.
    src_path = f'{output_dir}/tmp.wav'
    tts_model.tts(prompt, src_path, speaker=style, language=language)
    save_path = f'{output_dir}/output.wav'
    encode_message = "@MyShell"  # watermark message embedded into the output audio
    tone_color_converter.convert(audio_src_path=src_path, src_se=source_se, tgt_se=target_se, output_path=save_path, message=encode_message)

    text_hint += "Response generated successfully.\n"
    return text_hint, save_path, audio_file_pth
# NOTE(review): `title` is never referenced by the Blocks UI below — confirm
# whether it was meant to be shown in the interface.
title = "MyShell OpenVoice"
class WhiteTheme(gr.themes.Base):
    """Minimal white-background / black-text theme for the demo UI.

    NOTE(review): these plain attribute assignments (body_background, etc.)
    may not be honored by Gradio's theming engine, which typically configures
    colors via constructor arguments or `Base.set(...)` — TODO confirm against
    the installed gradio version.
    """

    def __init__(self):
        super().__init__()
        self.body_background = "white"
        self.block_background = "white"
        self.text_color = "black"
        self.secondary_text_color = "black"
# Build the Gradio UI: inputs (text, style, reference audio) on the left,
# outputs (info text, synthesized audio, echoed reference) on the right.
with gr.Blocks(theme=WhiteTheme()) as demo:
    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time is better. Up to 200 text characters.",
                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
            )
            # Offer every style predict() accepts for English ('shouting' and
            # 'excited' were previously missing from the UI). The info text used
            # to describe the reference audio upload, not the style — fixed.
            style_gr = gr.Dropdown(
                label="Style",
                choices=['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
                info="Speaking style. Chinese text supports 'default' only.",
                max_choices=1,
                value="default",
            )
            # Reference voice to clone; should be about 1 minute long and clear.
            ref_gr = gr.Audio(
                label="Reference Audio",
                type="filepath",
                value="resources/demo_speaker2.mp3",
            )
            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

    tts_button.click(predict, [input_text_gr, style_gr, ref_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])

demo.queue()
demo.launch(debug=True, show_api=True, share=args.share)