File size: 1,982 Bytes
bfdf354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import streamlit as st
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
import random
import time

# Page heading shown at the top of the app.
st.title("Multiply TTS Generator")

# Phrase to synthesise; generate_speech() reads this module-level value.
text = st.text_input("write your word or sentence", value="Hi,duino")

# Number of distinct speaker embeddings to sample; the widget enforces
# a minimum of 1 and moves in whole-number steps.
num_random_voices = st.number_input(
    "Enter the number of random voices", min_value=1, value=1, step=1
)

# Name of the WAV file written by generate_speech(); empty until it has run.
output_filename = ""

# Sample rate (Hz) used both to size the pauses and to write the WAV file.
_SAMPLE_RATE = 16000

# Characters that are path separators or are rejected by common file systems.
_UNSAFE_FILENAME_CHARS = frozenset('\\/:*?"<>|')


def _speech_filename(phrase):
    """Return the output WAV filename derived from *phrase*.

    Whitespace runs become single underscores; path separators, other
    characters that are invalid in filenames, and non-printable characters
    are replaced by ``_`` so user input cannot point outside the working
    directory.
    """
    stem = "_".join(phrase.split())
    stem = "".join(
        "_" if ch in _UNSAFE_FILENAME_CHARS or not ch.isprintable() else ch
        for ch in stem
    )
    return stem + "_speech.wav"


def generate_speech(pause_seconds=2):
    """Synthesise the module-level ``text`` with randomly chosen voices.

    Reads the module-level ``text`` and ``num_random_voices``, generates one
    utterance per sampled speaker embedding, joins them with silence in
    between, and writes the result to a WAV file in the working directory.

    Args:
        pause_seconds: Length of the silence inserted between two
            consecutive voices, in seconds. Defaults to 2.

    Returns:
        The name of the WAV file that was written. The same value is also
        stored in the module-level ``output_filename`` for existing callers.

    Raises:
        ValueError: If the embeddings dataset is empty (from
            ``random.sample``).
    """
    global output_filename

    # NOTE: the processor, model and vocoder are loaded on every call.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    inputs = processor(text=text, return_tensors="pt")

    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    total_voices = len(embeddings_dataset)

    # random.sample() raises if asked for more items than exist, so clamp the
    # request to what the dataset can provide (and to at least one voice).
    voice_count = max(1, min(int(num_random_voices), total_voices))
    random_voices = random.sample(range(total_voices), voice_count)

    # Collect whole tensors and concatenate once, instead of extending a
    # Python list one sample at a time.
    segments = []
    for voice_index in random_voices:
        speaker_embeddings = torch.tensor(embeddings_dataset[voice_index]["xvector"]).unsqueeze(0)
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

        if segments:
            # Silence goes between voices only, never before the first one
            # or after the last one.
            pause_samples = int(_SAMPLE_RATE * pause_seconds)
            segments.append(torch.zeros(pause_samples, dtype=speech.dtype))
        segments.append(speech)

    combined_speech = torch.cat(segments).numpy()

    output_filename = _speech_filename(text)
    sf.write(output_filename, combined_speech, samplerate=_SAMPLE_RATE)
    return output_filename

# Run synthesis when the user presses the button, then play back the result.
if st.button("Generate"):
    generate_speech()
    # generate_speech() stores the written file's name in the module-level
    # `output_filename`. Read it inside a context manager so the handle is
    # closed instead of leaking on every click.
    with open(output_filename, "rb") as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format="audio/wav")
    st.write("Speech generated and saved as: " + output_filename)