import torch
import gradio as gr
from transformers import pipeline
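# Prefer CUDA when torch can see at least one GPU; otherwise fall back to CPU.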
def _grab_best_device(use_gpu=True):
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return device
device = _grab_best_device()
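# Baseline (non-finetuned) checkpoint per language, used as the reference output.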
default_model_per_language = {
    "english": "kakao-enterprise/vits-ljs",
    "spanish": "facebook/mms-tts-spa",
    "tamil": "facebook/mms-tts-tam",
}
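# Finetuned checkpoints selectable in the demo, grouped by language.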
models_per_language = {
    "english": [
        "ylacombe/vits_ljs_irish_male_monospeaker",
        "ylacombe/vits_ljs_irish_male_monospeaker_2",
        "ylacombe/vits_ljs_irish_male_2",
        "ylacombe/vits_ljs_welsh_female_monospeaker",
        "ylacombe/vits_ljs_welsh_female_monospeaker_2",
        "ylacombe/vits_ljs_welsh_female_2",
        "ylacombe/vits_ljs_welsh_male_monospeaker",
        "ylacombe/vits_ljs_welsh_male_monospeaker_2",
        "ylacombe/vits_ljs_scottish_female_monospeaker",
        "ylacombe/vits_ljs_scottish_female_2",
    ],
    "spanish": [
        "ylacombe/mms-spa-finetuned-chilean-monospeaker-all",
        "ylacombe/mms-spa-finetuned-chilean-monospeaker",
    ],
    "tamil": [
        "ylacombe/mms-tam-finetuned-monospeaker-all",
        "ylacombe/mms-tam-finetuned-monospeaker",
    ],
}
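# Finetuned checkpoint preloaded at startup so the first request is served immediately.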
HUB_PATH = "ylacombe/vits_ljs_welsh_female_monospeaker"
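# Module-level cache: holds the currently selected finetuned pipeline plus the
# matching language's baseline pipeline, so models are only reloaded on change.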
pipe_dict = {
    "current_model": HUB_PATH,
    "pipe": pipeline("text-to-speech", model=HUB_PATH, device=device),
    "original_pipe": pipeline("text-to-speech", model=default_model_per_language["english"], device=device),
    "language": "english",
}
title = "# 🐶 VITS"
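# Upper bound on audio players rendered in the UI; checkpoints with more
# speakers are truncated to this many samples.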
max_speakers = 15
description = """
"""
# Inference
def generate_audio(text, model_id, language):
    # Reload the baseline pipeline when the language changes.
    if pipe_dict["language"] != language:
        gr.Warning(f"Language has changed - loading new default model: {default_model_per_language[language]}")
        pipe_dict["language"] = language
        pipe_dict["original_pipe"] = pipeline("text-to-speech", model=default_model_per_language[language], device=device)

    # Reload the finetuned pipeline when the selected checkpoint changes.
    if pipe_dict["current_model"] != model_id:
        gr.Warning("Model has changed - loading new model")
        pipe_dict["pipe"] = pipeline("text-to-speech", model=model_id, device=device)
        pipe_dict["current_model"] = model_id

    num_speakers = pipe_dict["pipe"].model.config.num_speakers

    out = []
    # First, generate the non-finetuned (baseline) model's prediction for reference.
    output = pipe_dict["original_pipe"](text)
    out.append(gr.Audio(
        value=(output["sampling_rate"], output["audio"].squeeze()),
        type="numpy",
        autoplay=False,
        label=f"Non-finetuned model prediction ({default_model_per_language[language]})",
        show_label=True,
        visible=True,
    ))

    if num_speakers > 1:
        # Multi-speaker checkpoint: synthesise one sample per speaker id.
        n_to_generate = min(num_speakers, max_speakers - 1)
        for i in range(n_to_generate):
            output = pipe_dict["pipe"](text, forward_params={"speaker_id": i})
            out.append(gr.Audio(
                value=(output["sampling_rate"], output["audio"].squeeze()),
                type="numpy",
                autoplay=False,
                label=f"Generated Audio - speaker {i}",
                show_label=True,
                visible=True,
            ))
        # Hide the remaining slots so exactly max_speakers outputs are returned
        # (the original padded with max_speakers - num_speakers, one too many).
        out.extend([gr.Audio(visible=False)] * (max_speakers - 1 - n_to_generate))
    else:
        output = pipe_dict["pipe"](text)
        out.append(gr.Audio(
            value=(output["sampling_rate"], output["audio"].squeeze()),
            type="numpy",
            autoplay=False,
            label="Generated Audio - Mono speaker",
            show_label=True,
            visible=True,
        ))
        out.extend([gr.Audio(visible=False)] * (max_speakers - 2))
    return out
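# Note: generate_audio always returns exactly max_speakers gr.Audio updates to
# match the `outputs` list wired up below; unused slots are returned hidden.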
# Gradio Blocks demo
with gr.Blocks() as demo_blocks:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            inp_text = gr.Textbox(label="Input Text", info="What would you like VITS to synthesise?")
            btn = gr.Button("Generate Audio!")
            language = gr.Dropdown(
                list(default_model_per_language.keys()),
                value="english",
                label="Language",
                info="Language that you want to test",
            )
            model_id = gr.Dropdown(
                models_per_language["english"],
                value=HUB_PATH,
                label="Model",
                info="Model you want to test",
            )
        with gr.Column():
            outputs = []
            for i in range(max_speakers):
                out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
                outputs.append(out_audio)

    # When the language changes, repopulate the model dropdown with that
    # language's checkpoints and select the first one.
    language.change(
        lambda language: gr.Dropdown(
            models_per_language[language],
            value=models_per_language[language][0],
            label="Model",
            info="Model you want to test",
        ),
        language,
        model_id,
    )
    btn.click(generate_audio, [inp_text, model_id, language], outputs)

demo_blocks.queue().launch()
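# When running locally, passing share=True to launch() additionally exposes a
# temporary public URL; on Spaces the default launch() arguments suffice.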