|
import gradio as gr |
|
import soundfile as sf |
|
import torch |
|
from transformers import AutoTokenizer |
|
from parler_tts import ParlerTTSForConditionalGeneration |
|
from rubyinserter import add_ruby |
|
|
|
|
|
# Run inference on GPU when CUDA is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
# Cache of loaded (model, tokenizer) pairs keyed by model name, so repeated
# generations with the same model (the common UI case) don't re-load weights.
_MODEL_CACHE: dict = {}


def gen(model_name: str, prompt: str, description: str, output_file_path: str) -> None:
    """Synthesize speech for *prompt* with a Parler-TTS model and write a WAV file.

    Args:
        model_name: Hugging Face model id to load (cached after first load).
        prompt: Text to speak; ruby (furigana) annotations are inserted first.
        description: Natural-language description of the desired voice/style,
            used as the conditioning input for Parler-TTS.
        output_file_path: Destination path for the generated WAV audio.
    """
    if model_name not in _MODEL_CACHE:
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _MODEL_CACHE[model_name] = (model, tokenizer)
    model, tokenizer = _MODEL_CACHE[model_name]

    # Japanese Parler-TTS checkpoints expect ruby-annotated text for readings.
    prompt = add_ruby(prompt)
    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()
    sf.write(output_file_path, audio_arr, model.config.sampling_rate)
|
|
|
def generate_audio(model_name: str, prompt: str, description: str):
    """Gradio callback: synthesize speech and return the path of the WAV file."""
    wav_path = "output.wav"
    gen(model_name, prompt, description, wav_path)
    return wav_path
|
|
|
def main():
    """Build and return the Gradio Blocks UI for Parler-TTS speech generation."""
    # Fine-tuned Japanese Parler-TTS checkpoints selectable from the dropdown.
    model_choices = [
        "Atotti/parler-tts-mini-bate-voiceactress100-ex-ayuto",
        "Atotti/parler-tts-mini-bate-voiceactress100-ex-hiroki",
        "Atotti/parler-tts-mini-bate-voiceactress100-ex-olimov",
        "Atotti/parler-tts-mini-bate-voiceactress100-ex-mako",
        "Atotti/parler-tts-mini-bate-voiceactress100-ex-hinako",
    ]

    with gr.Blocks() as demo:
        gr.Markdown("## Text-to-Speech Demo")

        with gr.Row():
            model_name_input = gr.Dropdown(choices=model_choices, label="Model Name")
            prompt_input = gr.Textbox(label="Prompt", placeholder="例: テキスト入力")
            # Plain literal: the original used an f-string with no placeholders.
            description_input = gr.Textbox(label="Description", placeholder="例: Ayuto's voice delivers her words at a moderate speed with a quite monotone tone slightly low pitch in a confined environment. The pace of her speech is slow, resulting in a quite clear audio recording.")

        generate_button = gr.Button("Generate Audio")
        # type="filepath" so the callback can return a path to the written WAV.
        audio_output = gr.Audio(label="Generated Audio", type="filepath")

        generate_button.click(
            generate_audio,
            inputs=[model_name_input, prompt_input, description_input],
            outputs=audio_output,
        )

    return demo
|
|
|
if __name__ == "__main__":
    # Build the Gradio app and start the local web server.
    app = main()
    app.launch()
|
|