import spaces
import tempfile
import os

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F

from whisperspeech.languages import LANGUAGES
from whisperspeech.pipeline import Pipeline
from whisperspeech.utils import resampler
title = """# 🙋🏻♂️ Welcome to🌟Tonic's🌬️💬📝WhisperSpeech | |
You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co./collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Previously known as spear-tts-pytorch. It's like Stable Diffusion but for speech – both powerful and easily customizable. | |
You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co./spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3> | |
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co./TeamTonic) & [MultiTransformer](https://huggingface.co./MultiTransformer) On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 [Poly](https://github.com/tonic-ai/poly) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗 | |
""" | |
def whisper_speech_demo(text, lang, speaker_audio, mix_lang, mix_text):
    pipe = Pipeline()
    speaker_url = None
    if speaker_audio is not None:
        speaker_url = speaker_audio
    if mix_lang and mix_text:
        # Mixed-language mode: mix_lang comes from a CheckboxGroup (a list of codes),
        # while lang/text may be empty placeholders from the hidden components.
        extra_langs = mix_lang if isinstance(mix_lang, list) else mix_lang.split(',')
        mixed_langs = ([lang] if lang else []) + extra_langs
        mixed_texts = ([text] if text else []) + mix_text.split(',')
        stoks = pipe.t2s.generate(mixed_texts, lang=mixed_langs)
        audio_data = pipe.generate(stoks, speaker_url, lang=mixed_langs[0])
    else:
        audio_data = pipe.generate(text, speaker_url, lang)
    # Resample the generated audio from 22.05 kHz to 24 kHz.
    resample_audio = resampler(newsr=24000)
    audio_data_resampled = next(resample_audio([{'sample_rate': 22050, 'samples': audio_data.cpu()}]))['samples_24k']
    # Normalize and write to a WAV file.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        tmp_file_name = tmp_file.name
        audio_np = audio_data_resampled.numpy()  # Convert to a NumPy array
        # Collapse any leading batch/channel dimension to a 1-D waveform.
        audio_np = np.squeeze(audio_np)
        # Normalize to [-1, 1] if necessary.
        if audio_np.max() > 1.0 or audio_np.min() < -1.0:
            audio_np = audio_np / np.max(np.abs(audio_np))
        # Ensure the audio data is 2D (num_samples, num_channels).
        if audio_np.ndim == 1:
            audio_np = np.expand_dims(audio_np, axis=1)
        # Write the file at 24 kHz.
        sf.write(tmp_file_name, audio_np, 24000)
    return tmp_file_name
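# A minimal sketch of calling the generation function directly, outside Gradio,
# assuming "en" is a valid key in LANGUAGES; the mix_lang/mix_text arguments are
# left empty so the plain TTS path is taken:
#
#   wav_path = whisper_speech_demo("Hello from WhisperSpeech!", "en", None, "", "")
#   print(wav_path)  # path to a temporary 24 kHz WAV file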
with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Tabs():
        with gr.TabItem("🌬️💬📝Standard TTS"):
            with gr.Row():
                text_input_standard = gr.Textbox(label="Enter text")
                lang_input_standard = gr.Dropdown(choices=list(LANGUAGES.keys()), label="Language")
                speaker_input_standard = gr.Audio(label="Upload or Record Speaker Audio (optional)", sources=["upload", "microphone"], type="filepath")
                placeholder_mix_lang = gr.Textbox(visible=False)  # Placeholder, hidden
                placeholder_mix_text = gr.Textbox(visible=False)  # Placeholder, hidden
            generate_button_standard = gr.Button("Generate Speech")
            output_audio_standard = gr.Audio(label="🌬️💬📝WhisperSpeech")
            generate_button_standard.click(
                whisper_speech_demo,
                inputs=[text_input_standard, lang_input_standard, speaker_input_standard, placeholder_mix_lang, placeholder_mix_text],
                outputs=output_audio_standard
            )
        with gr.TabItem("🌬️💬📝Mixed Language TTS"):
            with gr.Row():
                placeholder_text_input = gr.Textbox(visible=False)  # Placeholder, hidden
                placeholder_lang_input = gr.Dropdown(choices=[], visible=False)  # Placeholder, hidden
                placeholder_speaker_input = gr.Audio(visible=False)  # Placeholder, hidden
                mix_lang_input_mixed = gr.CheckboxGroup(choices=list(LANGUAGES.keys()), label="Select Languages")
                mix_text_input_mixed = gr.Textbox(label="Enter mixed language text", placeholder="e.g., Hello, Cześć")
            generate_button_mixed = gr.Button("Generate Mixed Speech")
            output_audio_mixed = gr.Audio(label="Mixed🌬️💬📝WhisperSpeech")
            generate_button_mixed.click(
                whisper_speech_demo,
                inputs=[placeholder_text_input, placeholder_lang_input, placeholder_speaker_input, mix_lang_input_mixed, mix_text_input_mixed],
                outputs=output_audio_mixed
            )

demo.launch()
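# Gradio's standard launch options apply here as well, e.g. demo.launch(share=True)
# for a temporary public link, or demo.launch(server_name="0.0.0.0") to listen on
# all interfaces when running outside a Hugging Face Space.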