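# OpenVoice Gradio demo: synthesises speech from text in English or Chinese and
# clones the tone color of an uploaded reference voice onto the output.
#
# Usage (assuming this file is saved as openvoice_app.py):
#   python openvoice_app.py           # local demo
#   python openvoice_app.py --share   # create a public Gradio share link
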
import os
import torch
import argparse
import gradio as gr
import langid
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
from dotenv import load_dotenv

# Argument parsing
parser = argparse.ArgumentParser()
parser.add_argument("--share", action='store_true', default=False, help="make the Gradio link public")
args = parser.parse_args()
load_dotenv()  # load variables from a local .env file, if present

# Paths and device setup
en_ckpt_base = 'base_speakers/EN'
zh_ckpt_base = 'base_speakers/ZH'
ckpt_converter = 'converter'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# Load models
en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Load speaker embeddings
en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
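# These "tone color" embeddings describe the base speakers; predict() later
# maps them onto the embedding extracted from the uploaded reference audio.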

# Supported languages
supported_languages = ['zh', 'en']

def predict(prompt, style, audio_file_pth):
    text_hint = ''

    # Detect the input language
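    # langid.classify returns a (language_code, score) pair; [0] keeps the code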
    language_predicted = langid.classify(prompt)[0].strip()
    print(f"Detected language: {language_predicted}")

    if language_predicted not in supported_languages:
        text_hint += f"[ERROR] The detected language {language_predicted} is not supported. Supported languages: {supported_languages}\n"
        return text_hint, None, None

    if language_predicted == "zh":
        tts_model = zh_base_speaker_tts
        source_se = zh_source_se
        language = 'Chinese'
        if style != 'default':
            text_hint += f"[ERROR] The style {style} is not supported for Chinese. Supported style: 'default'\n"
            return text_hint, None, None
    else:
        tts_model = en_base_speaker_tts
        source_se = en_source_default_se if style == 'default' else en_source_style_se
        language = 'English'
        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
            text_hint += f"[ERROR] The style {style} is not supported for English. Supported styles: ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
            return text_hint, None, None

    if len(prompt) < 2:
        text_hint += "[ERROR] Please provide a longer prompt text.\n"
        return text_hint, None, None
    if len(prompt) > 200:
        text_hint += "[ERROR] Text length is limited to 200 characters. Please try a shorter text.\n"
        return text_hint, None, None

    try:
        target_se, audio_name = se_extractor.get_se(audio_file_pth, tone_color_converter, target_dir='processed', vad=True)
    except Exception as e:
        text_hint += f"[ERROR] Error extracting tone color: {str(e)}\n"
        return text_hint, None, None

    # Synthesise speech with the base speaker first; its tone color is converted next
    src_path = f'{output_dir}/tmp.wav'
    tts_model.tts(prompt, src_path, speaker=style, language=language)

    save_path = f'{output_dir}/output.wav'
    encode_message = "@MyShell"  # message embedded as a watermark in the output audio
    tone_color_converter.convert(audio_src_path=src_path, src_se=source_se, tgt_se=target_se, output_path=save_path, message=encode_message)

    text_hint += "Response generated successfully.\n"
    return text_hint, save_path, audio_file_pth
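
# Example (sketch): calling predict() directly, without the Gradio UI, using
# the demo reference clip that the interface loads by default:
#   hint, wav_path, used_ref = predict(
#       "Hello from OpenVoice!", "default", "resources/demo_speaker2.mp3"
#   )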

title = "MyShell OpenVoice"

# Gradio interface setup; custom CSS hides the Gradio footer
css = "footer {visibility: hidden}"
with gr.Blocks(theme=gr.themes.Glass(), css=css, title=title) as demo:
    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time works best. Up to 200 characters.",
                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
            )
            style_gr = gr.Dropdown(
                label="Style",
                choices=['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
                info="Select a style for the synthesised speech. Chinese input supports 'default' only.",
                max_choices=1,
                value="default",
            )
            gr.Markdown(
                "Please upload a reference audio clip that is at least 1 minute long. For best results, ensure the audio is clear. "
                "You can use Adobe Podcast Enhance (https://podcast.adobe.com/enhance) to improve the audio quality before uploading."
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                type="filepath",
                value="resources/demo_speaker2.mp3",
            )
            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

            tts_button.click(predict, inputs=[input_text_gr, style_gr, ref_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])

demo.queue()
demo.launch(debug=True, show_api=False, share=args.share)
