import torch
from transformers import pipeline
import numpy as np
import gradio as gr
def _grab_best_device(use_gpu=True):
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return device
device = _grab_best_device()
default_model_per_language = {
"english": "kakao-enterprise/vits-ljs",
"spanish": "facebook/mms-tts-spa",
}
models_per_language = {
"english": [
("Welsh Female Speaker", "ylacombe/vits_ljs_welsh_female_monospeaker_2"),
("Welsh Male Speaker", "ylacombe/vits_ljs_welsh_male_monospeaker_2"),
("Scottish Female Speaker", "ylacombe/vits_ljs_scottish_female_monospeaker"),
("Northern Female Speaker", "ylacombe/vits_ljs_northern_female_monospeaker"),
("Midlands Male Speaker", "ylacombe/vits_ljs_midlands_male_monospeaker"),
("Southern Male Speaker", "ylacombe/vits_ljs_southern_male_monospeaker"),
("Irish Male Speaker", "ylacombe/vits_ljs_irish_male_monospeaker_2"),
],
"spanish": [
("Male Chilean Speaker", "ylacombe/mms-spa-finetuned-chilean-monospeaker"),
("Female Argentinian Speaker", "ylacombe/mms-spa-finetuned-argentinian-monospeaker"),
("Male Colombian Speaker", "ylacombe/mms-spa-finetuned-colombian-monospeaker"),
],
}
pipe_dict = {
    "pipe": [pipeline("text-to-speech", model=checkpoint, device=device) for _, checkpoint in models_per_language["english"]],
    "original_pipe": pipeline("text-to-speech", model=default_model_per_language["english"], device=device),
    "language": "english",
}
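
# Note: each "text-to-speech" pipeline call returns a dict with an "audio"
# numpy array and a "sampling_rate" int; generate_audio below unpacks these
# into (sampling_rate, waveform) tuples for gr.Audio.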
title = """# Explore English and Spanish Accents with VITS finetuning
## Or how the best wine comes in old bottles
[VITS](https://huggingface.co./docs/transformers/model_doc/vits) is a light weight, low-latency TTS model.
Coupled with the right data and the right training recipe, you can get an excellent finetuned version in **20 minutes** with as little as **80 to 150 samples**.
The training recipe is available in this [GitHub repository](https://github.com/ylacombe/finetune-hf-vits)!
"""
max_speakers = 15
# Inference
def generate_audio(text, language):
    if pipe_dict["language"] != language:
        gr.Warning(f"Language has changed - loading corresponding models: {default_model_per_language[language]}")
        pipe_dict["language"] = language
        pipe_dict["original_pipe"] = pipeline("text-to-speech", model=default_model_per_language[language], device=device)
        pipe_dict["pipe"] = [pipeline("text-to-speech", model=checkpoint, device=device) for _, checkpoint in models_per_language[language]]

    out = []

    # First, generate with the original (non-finetuned) checkpoint.
    output = pipe_dict["original_pipe"](text)
    out.append(gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False,
                        label=f"Prediction from the original checkpoint {default_model_per_language[language]}", show_label=True, visible=True))

    # Then, one clip per finetuned accent checkpoint.
    for i in range(min(len(pipe_dict["pipe"]), max_speakers - 1)):
        output = pipe_dict["pipe"][i](text)
        out.append(gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False,
                            label=f"Finetuned {models_per_language[language][i][0]}", show_label=True, visible=True))

    # Pad with hidden components so the output count always matches max_speakers.
    out.extend([gr.Audio(visible=False)] * (max_speakers - len(out)))
    return out
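
# Usage sketch outside the UI (a hypothetical smoke test, not part of the demo):
#   audios = generate_audio("The quick brown fox jumps over the lazy dog.", "english")
#   audios[0] is the original checkpoint's prediction; the remaining entries are
#   the finetuned accents, padded with hidden gr.Audio components.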
css = """
#container{
margin: 0 auto;
max-width: 80rem;
}
#intro{
max-width: 100%;
text-align: center;
margin: 0 auto;
}
"""
# Gradio blocks demo
with gr.Blocks(css=css) as demo_blocks:
    gr.Markdown(title, elem_id="intro")
    with gr.Row():
        with gr.Column():
            inp_text = gr.Textbox(label="Input Text", info="What sentence would you like to synthesise?")
            btn = gr.Button("Generate Audio!")
            language = gr.Dropdown(
                list(default_model_per_language.keys()),
                value="english",
                label="Language",
                info="Language that you want to test",
            )
        with gr.Column():
            outputs = []
            for i in range(max_speakers):
                out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
                outputs.append(out_audio)
with gr.Accordion("Datasets and models details"):
gr.Markdown("""
### English
* **Model**: [VITS-ljs](https://huggingface.co./kakao-enterprise/vits-ljs)
* **Dataset**: [British Isles Accent](https://huggingface.co./datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co./kakao-enterprise/vits-ljs).
### Spanish
* **Model**: [Spanish MMS TTS](https://huggingface.co./facebook/mms-tts-spa). This model is part of Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project, aiming to
provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co./models?sort=trending&search=facebook%2Fmms-tts).
* **Datasets**: For each accent, we used 100 to 150 samples of a single speaker to finetune the model.
- [Colombian Spanish TTS dataset](https://huggingface.co./datasets/ylacombe/google-colombian-spanish).
- [Argentinian Spanish TTS dataset](https://huggingface.co./datasets/ylacombe/google-argentinian-spanish).
- [Chilean Spanish TTS dataset](https://huggingface.co./datasets/ylacombe/google-chilean-spanish).
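
        A minimal sketch of loading one of these datasets with the 🤗 `datasets` library (the `"train"` split and the printed columns are assumptions; check each dataset card):

        ```py
        from datasets import load_dataset

        # Split name and column layout are assumptions for illustration.
        ds = load_dataset("ylacombe/google-chilean-spanish", split="train")
        print(ds[0])
        ```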
""")
with gr.Accordion("Run VITS and MMS with transformers", open=False):
gr.Markdown(
"""
```bash
pip install transformers
```
```py
from transformers import pipeline
import scipy
pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0)
results = pipe("A cinematic shot of a baby racoon wearing an intricate italian priest robe")
# write to a wav file
scipy.io.wavfile.write("audio_vits.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
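
            # The Spanish MMS checkpoint used in this demo works the same way
            # (the sentence below is just an illustrative prompt):
            mms_pipe = pipeline("text-to-speech", model="facebook/mms-tts-spa", device=0)
            mms_results = mms_pipe("El rápido zorro marrón salta sobre el perro perezoso")
            scipy.io.wavfile.write("audio_mms.wav", rate=mms_results["sampling_rate"], data=mms_results["audio"].squeeze())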
            ```
            """
        )
    btn.click(generate_audio, [inp_text, language], outputs)

demo_blocks.queue().launch()