# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree
import subprocess

# Build the monotonic_align Cython extension in place before `inference`
# (imported below) needs it. Using cwd= instead of a chain of `cd` commands
# avoids the hard-coded `/home/user/app` path the original shell string used
# (a `cd` at the end of a one-shot subshell had no effect anyway).
subprocess.check_output(
    "mkdir -p monotonic_align && python setup.py build_ext --inplace",
    shell=True,
    text=True,
    cwd="./modules/monotonic_align",
)
import gradio as gr | |
import os | |
import inference | |
# Maps the speaker display name shown in the UI to the internal
# HiFi-TTS speaker id understood by the inference script.
SUPPORTED_SPEAKERS = {
    "Cori Samuel": "hifitts_92",
    "Phil Benson": "hifitts_6097",
    "Mike Pelton": "hifitts_6670",
    "Tony Oliva": "hifitts_6671",
    "Maria Kasper": "hifitts_8051",
    "John Van Stan": "hifitts_9017",
    "Helen Taylor": "hifitts_9136",
    "Sylviamb": "hifitts_11614",
    "Celine Major": "hifitts_11697",
    "LikeManyWaters": "hifitts_12787",
}
def tts_inference(
    input_text,
    target_speaker,
    duration
):
    """Synthesize ``input_text`` in the voice of ``target_speaker``.

    Args:
        input_text: Text to synthesize.
        target_speaker: Display name; must be a key of SUPPORTED_SPEAKERS.
        duration: Speaking-rate slider value in [0.1, 2]; higher is faster.

    Returns:
        Path to the generated waveform file.
    """
    ### Target Speaker ###
    # Map the UI display name to the internal speaker id (avoid
    # shadow-reassigning the parameter, as the original did).
    speaker_id = SUPPORTED_SPEAKERS[target_speaker]

    # "--duration_control": the UI slider is "higher = faster" while the
    # model knob is "higher = slower"; 2.05 - duration inverts the scale
    # and keeps the value strictly positive over the slider's range.
    # "--speaker_name_2" stays None in plain TTS mode; inference.main is
    # expected to tolerate a None element here (TODO confirm).
    args_list = [
        "--config", "./egs/tts/vits_hifitts/exp_config.json",
        "--checkpoint_path", "./expdir/checkpoint/latest-checkpoint",
        "--speaker_name_1", speaker_id,
        "--speaker_name_2", None,
        "--text", input_text,
        "--mode", "single",
        "--duration_control", str(2.05 - duration),
        "--output_dir", "result",
        "--log_level", "debug",
    ]

    os.environ["WORK_DIR"] = "./"
    inference.main(args_list)

    ### Display ###
    # inference.main writes the prediction to this fixed location.
    return "result/single/test_pred.wav"
def tc_inference(
    input_text,
    target_speaker_1,
    target_speaker_2,
    confusion_degree,
    duration
):
    """Synthesize speech whose timbre interpolates between two speakers.

    Args:
        input_text: Text to synthesize.
        target_speaker_1: Display name of the first speaker
            (key of SUPPORTED_SPEAKERS).
        target_speaker_2: Display name of the second speaker, or None to
            skip fusion.
        confusion_degree: Fusion weight in [0, 1]; higher favors speaker 2.
        duration: Speaking-rate slider value in [0.1, 2]; higher is faster.

    Returns:
        Tuple of paths: (speaker-1 reference wav, speaker-2 reference wav,
        interpolated prediction wav).
    """
    ### Target Speaker ###
    speaker_id_1 = SUPPORTED_SPEAKERS[target_speaker_1]
    # The second speaker is optional; inference.main is expected to accept
    # a None element here (TODO confirm against its argument parsing).
    speaker_id_2 = (
        SUPPORTED_SPEAKERS[target_speaker_2]
        if target_speaker_2 is not None
        else None
    )

    # "--duration_control": the UI slider is "higher = faster" while the
    # model knob is "higher = slower"; 2.05 - duration inverts the scale.
    args_list = [
        "--config", "./egs/tts/vits_hifitts/exp_config.json",
        "--checkpoint_path", "./expdir/checkpoint/latest-checkpoint",
        "--speaker_name_1", speaker_id_1,
        "--speaker_name_2", speaker_id_2,
        "--alpha", str(float(confusion_degree)),
        "--text", input_text,
        "--mode", "single",
        "--duration_control", str(2.05 - duration),
        "--output_dir", "result",
        "--log_level", "debug",
    ]

    os.environ["WORK_DIR"] = "./"
    inference.main(args_list)

    ### Display ###
    # inference.main writes the two per-speaker references and the
    # interpolated prediction to fixed locations under result/single/.
    return (
        "result/single/s1.wav",
        "result/single/s2.wav",
        "result/single/test_pred.wav",
    )
# --- Section 1: plain text-to-speech --------------------------------------
# Widgets are listed in the same order as tts_inference's parameters.
tts_demo_inputs = [
    gr.Textbox(
        type="text",
        label="Input Text",
        placeholder="Type something here..",
    ),
    gr.Radio(
        value="Cori Samuel",
        label="Target Speaker",
        choices=list(SUPPORTED_SPEAKERS.keys()),
    ),
    gr.Slider(
        minimum=0.1,
        maximum=2,
        value=1,
        step=0.05,
        label="Speaking Rate",
        info="As the step number increases, the speaking rate will be faster.",
    ),
]

tts_demo_output = gr.Audio(label="Generated Speech")
# --- Section 2: timbre confusion (voice fusion) ----------------------------
# Widgets are listed in the same order as tc_inference's parameters.
tc_demo_inputs = [
    gr.Textbox(
        type="text",
        label="Input Text",
        placeholder="Type something here..",
    ),
    gr.Radio(
        value="Cori Samuel",
        label="Target Speaker 1",
        choices=list(SUPPORTED_SPEAKERS.keys()),
    ),
    gr.Radio(
        value="Phil Benson",
        label="Target Speaker 2",
        choices=list(SUPPORTED_SPEAKERS.keys()),
    ),
    gr.Slider(
        minimum=0,
        maximum=1,
        value=0.5,
        step=0.1,
        label="Fusion Degree",
        info="As the step number increases, the generated voice will be more similar to speaker 2.",
    ),
    gr.Slider(
        minimum=0.1,
        maximum=2,
        value=1,
        step=0.05,
        label="Speaking Rate",
        info="As the step number increases, the speaking rate will be faster.",
    ),
]

# One audio player per element of tc_inference's returned tuple.
tc_demo_outputs = [
    gr.Audio(label="Target Speaker 1"),
    gr.Audio(label="Target Speaker 2"),
    gr.Audio(label="Interpolated Speech"),
]
# Assemble both demos onto one page and start the server. Removed the dead
# commented-out __main__ guard that contradicted the unconditional launch;
# Hugging Face Spaces executes this file top-level, so the direct calls stand.
with gr.Blocks() as demo:
    gr.Interface(
        fn=tts_inference,
        inputs=tts_demo_inputs,
        outputs=tts_demo_output,
        title="Amphion Text-to-Speech",
        description="This demo offers an Amphion TTS pretrained model (VITS) for you to explore."
    )
    gr.Interface(
        fn=tc_inference,
        inputs=tc_demo_inputs,
        outputs=tc_demo_outputs,
        title="Voice Fusion",
        description="In this section, you can choose two speakers to create a voice mix. Adjust the ‘Fusion Degree’ slider to customize your desired mix ratio between the two speakers."
    )

# Queue requests so concurrent users are handled sequentially, then serve.
demo.queue()
demo.launch()