|
import gradio as gr |
|
import argparse |
|
import torchaudio |
|
from tts import StepAudioTTS |
|
from tokenizer import StepAudioTokenizer |
|
from datetime import datetime |
|
import os |
|
|
|
|
|
|
|
|
|
def tts_common(text, speaker, emotion, language, speed): |
|
text = ( |
|
(f"({emotion})" if emotion else "") |
|
+ (f"({language})" if language else "") |
|
+ (f"({speed})" if speed else "") |
|
+ text |
|
) |
|
output_audio, sr = tts_engine(text, speaker) |
|
return output_audio |
|
|
|
|
|
|
|
def tts_music(text_input_rap, speaker, mode_input): |
|
text_input_rap = f"({mode_input})" + text_input_rap |
|
output_audio, sr = tts_engine(text_input_rap, speaker) |
|
return output_audio |
|
|
|
|
|
|
|
def tts_clone(text, wav_file, speaker_prompt, emotion, language, speed): |
|
clone_speaker = { |
|
"wav_path": wav_file, |
|
"speaker": "custom_voice", |
|
"prompt_text": speaker_prompt, |
|
} |
|
clone_text = ( |
|
(f"({emotion})" if emotion else "") |
|
+ (f"({language})" if language else "") |
|
+ (f"({speed})" if speed else "") |
|
+ text |
|
) |
|
output_audio, sr = tts_engine(clone_text, "", clone_speaker) |
|
return output_audio |
|
|
|
|
|
def launch_demo(args): |
|
|
|
emotion_options = ["高兴1", "高兴2", "生气1", "生气2", "悲伤1", "撒娇1"] |
|
language_options = ["中文", "英文", "韩语", "日语", "四川话", "粤语", "广东话"] |
|
speed_options = ["慢速1", "慢速2", "快速1", "快速2"] |
|
speaker_options = ["Tingting", "nezha"] |
|
|
|
with gr.Blocks() as demo: |
|
gr.Markdown("## 🎙️ Step-Audio-TTS-3B Demo") |
|
|
|
|
|
with gr.Tab("Common TTS (普通语音合成)"): |
|
text_input = gr.Textbox( |
|
label="Input Text (输入文本)", |
|
) |
|
speaker_input = gr.Dropdown( |
|
speaker_options, |
|
label="Speaker Selection (音色选择)", |
|
) |
|
emotion_input = gr.Dropdown( |
|
emotion_options, |
|
label="Emotion Style (情感风格)", |
|
allow_custom_value=True, |
|
interactive=True, |
|
) |
|
language_input = gr.Dropdown( |
|
language_options, |
|
label="Language/Dialect (语言/方言)", |
|
allow_custom_value=True, |
|
interactive=True, |
|
) |
|
speed_input = gr.Dropdown( |
|
speed_options, |
|
label="Speech Rate (语速调节)", |
|
allow_custom_value=True, |
|
interactive=True, |
|
) |
|
submit_btn = gr.Button("🔊 Generate Speech (生成语音)") |
|
output_audio = gr.Audio( |
|
label="Output Audio (合成语音)", |
|
interactive=False, |
|
) |
|
|
|
submit_btn.click( |
|
tts_common, |
|
inputs=[ |
|
text_input, |
|
speaker_input, |
|
emotion_input, |
|
language_input, |
|
speed_input, |
|
], |
|
outputs=output_audio, |
|
) |
|
|
|
|
|
with gr.Tab("RAP/Humming Mode (RAP/哼唱模式)"): |
|
text_input_rap = gr.Textbox( |
|
label="Lyrics Input (歌词输入)", |
|
) |
|
speaker_input = gr.Dropdown( |
|
speaker_options, |
|
label="Speaker Selection (音色选择)", |
|
) |
|
mode_input = gr.Radio( |
|
["RAP", "Humming (哼唱)"], |
|
value="RAP", |
|
label="Generation Mode (生成模式)", |
|
) |
|
submit_btn_rap = gr.Button("🎤 Generate Performance (生成演绎)") |
|
output_audio_rap = gr.Audio( |
|
label="Performance Audio (演绎音频)", interactive=False |
|
) |
|
submit_btn_rap.click( |
|
tts_music, |
|
inputs=[text_input_rap, speaker_input, mode_input], |
|
outputs=output_audio_rap, |
|
) |
|
|
|
with gr.Tab("Voice Clone (语音克隆)"): |
|
text_input_clone = gr.Textbox( |
|
label="Target Text (目标文本)", |
|
placeholder="Text to be synthesized with cloned voice (待克隆语音合成的文本)", |
|
) |
|
audio_input = gr.File( |
|
label="Reference Audio Upload (参考音频上传)", |
|
) |
|
speaker_prompt = gr.Textbox( |
|
label="Exact text from reference audio (输入参考音频的准确文本)", |
|
) |
|
emotion_input = gr.Dropdown( |
|
emotion_options, |
|
label="Emotion Style (情感风格)", |
|
allow_custom_value=True, |
|
interactive=True, |
|
) |
|
language_input = gr.Dropdown( |
|
language_options, |
|
label="Language/Dialect (语言/方言)", |
|
allow_custom_value=True, |
|
interactive=True, |
|
) |
|
speed_input = gr.Dropdown( |
|
speed_options, |
|
label="Speech Rate (语速调节)", |
|
allow_custom_value=True, |
|
interactive=True, |
|
) |
|
submit_btn_clone = gr.Button("🗣️ Synthesize Cloned Speech (合成克隆语音)") |
|
output_audio_clone = gr.Audio( |
|
label="Cloned Speech Output (克隆语音输出)", |
|
interactive=False, |
|
) |
|
submit_btn_clone.click( |
|
tts_clone, |
|
inputs=[ |
|
text_input_clone, |
|
audio_input, |
|
speaker_prompt, |
|
emotion_input, |
|
language_input, |
|
speed_input, |
|
], |
|
outputs=output_audio_clone, |
|
) |
|
|
|
|
|
demo.queue().launch(server_name=args.server_name, server_port=args.server_port) |
|
|
|
|
|
if __name__ == "__main__": |
|
model_id = "stepfun-ai/Step-Audio-TTS-3B" |
|
tokeniers = "stepfun-ai/Step-Audio-Tokenizer" |
|
encoder = StepAudioTokenizer(tokeniers) |
|
tts_engine = StepAudioTTS(model_id, encoder) |
|
launch_demo() |