"""Gradio demo for Collabora's WhisperSpeech text-to-speech model.

The page lets a user type (optionally multilingual) text, pick a tempo and,
optionally, supply a voice sample (uploaded/recorded file or URL) to clone.
Language switches are written inline as tags such as ``<en>`` or ``<pl>``.
"""
# `spaces` is deliberately kept as the very first import, as in the original
# file. NOTE(review): presumably required so the Hugging Face GPU wrapper is
# loaded before torch -- confirm before reordering.
import spaces
import gradio as gr
import io
import os
import re
import torch
import torchaudio
from pathlib import Path

from whisperspeech.pipeline import Pipeline


def _env_flag(name, default=False):
    """Return the boolean value of environment variable *name*.

    Unset -> *default*. Empty, "0", "false", "no" and "off" (any case) are
    treated as False so that e.g. ``DEVEL=0`` really disables the flag; any
    other value is True.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() not in {"", "0", "false", "no", "off"}


# Development mode: skips torch.compile and example caching, and pins the
# server to port 3000 (see the uses further down).
DEVEL = _env_flag('DEVEL')

# Sample rate reported to Gradio together with the generated waveform.
OUTPUT_SAMPLE_RATE = 24000

# Matches either a language tag like "<en>" (group 1) or a run of text that
# contains no "<" (group 2).
_LANG_TAG_OR_TEXT = re.compile(r"(?:<(\w+)>)|([^<]+)")

# NOTE(review): the first line of the page header looks like the alt text of
# an image whose markup was lost -- restore the banner markup if the asset
# URL is known.
title = """

WhisperSpeech banner with Collabora and LAION logos

# Welcome to Collabora's WhisperSpeech

WhisperSpeech is an Open Source text-to-speech system built by Collabora and LAION by inverting Whisper.
The model is fully open and you can run it on your local hardware. It's like **Stable Diffusion but for speech**
– both powerful and easily customizable.

[You can contribute to WhisperSpeech on Github.](https://github.com/collabora/WhisperSpeech)
You can also join the discussion on Discord [![](https://dcbadge.vercel.app/api/server/FANw4rHD5E)](https://discord.gg/FANw4rHD5E)

Huge thanks to [Tonic](https://huggingface.co/Tonic) who helped build this Space for WhisperSpeech.

### How to Use It

Write your text in the box, you can use language tags (`<en>` or `<pl>`) to create multilingual speech.
Optionally you can upload a speech sample or give it a file URL to clone an existing voice. Check out the
examples at the bottom of the page for inspiration.
"""

footer = """
### How to use it locally

```
pip install -U WhisperSpeech
```

Afterwards:

```
from whisperspeech.pipeline import Pipeline

pipe = Pipeline(torch_compile=True)
pipe.generate_to_file("output.wav", "Hello from WhisperSpeech.")
```
"""

# Each example is [text, speaker_url]; a speaker_url of None means "use the
# default voice". Some words are spelled phonetically on purpose.
# NOTE(review): the language tags in the Polish/mixed examples were
# reconstructed (untagged text is synthesised as English) -- confirm the exact
# tag placement against the deployed demo.
text_examples = [
    ["This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.", None],
    ["World War II or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis.", "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg"],
    ["<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze <en>Jewels.", None],
    ["<en> WhisperSpeech is an Open Source library that helps you convert text to speech. <pl>Teraz także po Polsku! <en>I think I just tried saying \"now also in Polish\", don't judge me...", None],
    # [" WhisperSpeech is multi-lingual y puede cambiar de idioma मध्य वाक्य में"],
    ["<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.", None],
    # [" The big difference between Europe et les Etats Unis jest to, że mamy tak wiele języków тут, в Європі"]
]


def parse_multilingual_text(input_text):
    """Split *input_text* into ``(language, text)`` segments.

    A tag such as ``<pl>`` switches the language for all text that follows it;
    text before the first tag is attributed to ``'en'``. A ``<`` that does not
    start a well-formed tag is skipped by the pattern.

    Returns:
        A non-empty list of ``(lang, text)`` tuples. When the input contains
        no text at all (empty, or tags only) the result is ``[("en", "")]``.
    """
    cur_lang = 'en'
    segments = []
    for lang, txt in _LANG_TAG_OR_TEXT.findall(input_text):
        if lang:
            cur_lang = lang
        else:
            # add spaces to give it some time to switch languages
            segments.append((cur_lang, f" {txt} "))
    if not segments:
        return [("en", "")]
    return segments


@spaces.GPU(enable_queue=True)
def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
    """Synthesise audio for *segments* with the given pipeline.

    Args:
        pipe: WhisperSpeech ``Pipeline`` providing ``t2s``, ``s2a``,
            ``vocoder``, ``extract_spk_emb`` and ``default_speaker``.
        segments: list of ``(lang, text)`` tuples, as produced by
            ``parse_multilingual_text``.
        speaker: path (``str`` or ``Path``) of a voice sample; takes
            precedence over *speaker_url* when given.
        speaker_url: URL of a voice sample, used when *speaker* is not a path.
        cps: tempo in characters per second, forwarded to the text-to-token
            model.

    Returns:
        The decoded audio moved to the CPU.
    """
    # Resolve the voice: explicit file first, then URL, then the built-in one.
    if isinstance(speaker, (str, Path)):
        speaker = pipe.extract_spk_emb(speaker)
    elif speaker_url:
        speaker = pipe.extract_spk_emb(speaker_url)
    else:
        speaker = pipe.default_speaker

    # Transpose [(lang, text), ...] into two parallel lists.
    langs, texts = [list(x) for x in zip(*segments)]
    print(texts, langs)

    stoks = pipe.t2s.generate(texts, cps=cps, lang=langs)
    # NOTE(review): 512 is presumably a padding/end marker in the token
    # stream -- confirm against the WhisperSpeech model definition.
    stoks = stoks[stoks != 512]
    atoks = pipe.s2a.generate(stoks, speaker.unsqueeze(0))
    audio = pipe.vocoder.decode(atoks)
    return audio.cpu()


def whisper_speech_demo(multilingual_text, speaker_audio=None, speaker_url="", cps=14):
    """Gradio callback: turn the text box contents into playable audio.

    Args:
        multilingual_text: text to speak, optionally containing language tags.
        speaker_audio: file path of an uploaded/recorded voice sample, or None.
        speaker_url: URL of a voice sample, or an empty string.
        cps: tempo in characters per second.

    Returns:
        ``(sample_rate, samples)`` as expected by ``gr.Audio``.

    Raises:
        gr.Error: if no text (or only whitespace) was entered.
    """
    # Reject None and whitespace-only input as well as the empty string, so
    # the model is never asked to speak nothing.
    if not multilingual_text or not multilingual_text.strip():
        raise gr.Error("Please enter some text for me to speak!")

    segments = parse_multilingual_text(multilingual_text)

    audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)

    return (OUTPUT_SAMPLE_RATE, audio.T.numpy())

    # Did not work for me in Safari:
    # mp3 = io.BytesIO()
    # torchaudio.save(mp3, audio, 24000, format='mp3')
    # return mp3.getvalue()


pipe = Pipeline(torch_compile=not DEVEL)
# warmup will come from regenerating the examples

with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Enter multilingual text💬📝",
                value=text_examples[0][0],
                info="You can use `<en>` for English and `<pl>` for Polish, see examples below.",
            )
            cps = gr.Slider(
                value=14,
                minimum=10,
                maximum=15,
                step=.25,
                label="Tempo (in characters per second)",
            )
            with gr.Row(equal_height=True):
                speaker_input = gr.Audio(
                    label="Upload or Record Speaker Audio (optional)🌬️💬",
                    sources=["upload", "microphone"],
                    type='filepath',
                )
                url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
            gr.Markdown(" \n ")  # fixes the bottom overflow from Audio
            generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="WhisperSpeech says…")

    with gr.Column():
        gr.Markdown("### Try these examples to get started !🌟🌬️")
        # The two example inputs are passed positionally, so the example URL
        # arrives in `speaker_audio`; generate_audio treats any str speaker as
        # a sample to extract the voice from, so this still selects the voice.
        gr.Examples(
            examples=text_examples,
            inputs=[text_input, url_input],
            outputs=[output_audio],
            fn=whisper_speech_demo,
            cache_examples=not DEVEL,
        )

    generate_button.click(
        whisper_speech_demo,
        inputs=[text_input, speaker_input, url_input, cps],
        outputs=output_audio,
    )
    gr.Markdown(footer)

demo.launch(server_port=3000 if DEVEL else None)