"""Streamlit app that speaks a text with several randomly chosen voices.

The entered text is synthesised once per randomly selected speaker embedding
using SpeechT5. The resulting clips are joined with a short silence between
them, written to a WAV file in the working directory, and offered for
playback in the page.
"""

import random
import re
import time

import soundfile as sf
import streamlit as st
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

SAMPLE_RATE = 16000  # sample rate used for the output file and for the pauses
PAUSE_SECONDS = 2  # silence inserted between two consecutive voices

st.title('Multiply TTS Generator')

text = st.text_input(
    label="write your word or sentence",
    value="Hi,duino"
)

num_random_voices = st.number_input(
    label="Enter the number of random voices",
    min_value=1,
    value=1,
    step=1
)

# Name of the most recently written WAV file; set by generate_speech().
output_filename = ""


# Loading the checkpoints and the embeddings dataset is by far the slowest
# step, so do it once and reuse the objects on later reruns instead of
# repeating it on every button click.
# NOTE(review): st.cache_resource needs a reasonably recent Streamlit
# release -- confirm the deployed version provides it.
@st.cache_resource
def _load_tts_resources():
    """Load the processor, TTS model, vocoder and speaker-embedding dataset."""
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors", split="validation"
    )
    return processor, model, vocoder, embeddings_dataset


def _output_name_for(sentence):
    """Build the WAV file name for *sentence*.

    Whitespace runs become single underscores. Any character that is not a
    word character, comma, dot or hyphen is also replaced by an underscore so
    that user input (e.g. ``/`` or ``\\``) cannot redirect the write to
    another directory.
    """
    stem = "_".join(sentence.split())
    stem = re.sub(r"[^\w,.\-]", "_", stem)
    return stem + "_speech.wav"


def generate_speech():
    """Synthesise ``text`` with random voices and write the result to disk.

    Reads the module-level ``text`` and ``num_random_voices`` widget values.
    One clip is generated per selected voice; clips are separated by
    ``PAUSE_SECONDS`` of silence and saved as a single WAV file.

    Returns:
        The name of the written WAV file. The same value is also stored in
        the module-level ``output_filename``.
    """
    global output_filename

    processor, model, vocoder, embeddings_dataset = _load_tts_resources()
    inputs = processor(text=text, return_tensors="pt")

    # random.sample() raises ValueError when asked for more items than the
    # population holds, so never request more voices than the dataset has.
    total_voices = len(embeddings_dataset)
    voice_count = min(int(num_random_voices), total_voices)
    chosen_voices = random.sample(range(total_voices), voice_count)

    pause_samples = int(SAMPLE_RATE * PAUSE_SECONDS)
    segments = []
    for voice_index in chosen_voices:
        speaker_embeddings = torch.tensor(
            embeddings_dataset[voice_index]["xvector"]
        ).unsqueeze(0)
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
        # Put the silence *between* clips only: before every clip but the first.
        if segments:
            segments.append(torch.zeros(pause_samples, dtype=speech.dtype))
        segments.append(speech)

    # Join everything as tensors and convert once, rather than extending a
    # Python list sample by sample with mixed NumPy/torch scalars.
    combined_speech = torch.cat(segments).cpu().numpy()

    output_filename = _output_name_for(text)
    sf.write(output_filename, combined_speech, samplerate=SAMPLE_RATE)
    return output_filename


if st.button("Generate"):
    if not text.strip():
        # Nothing to synthesise; avoid feeding an empty prompt to the model.
        st.warning("Please enter a word or sentence first.")
    else:
        generate_speech()

        # Read inside a context manager so the file handle is always closed.
        with open(output_filename, 'rb') as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format="audio/wav")

        st.write("Speech generated and saved as: " + output_filename)