File size: 3,966 Bytes
0065d28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import torch
import numpy as np
import librosa
import soundfile as sf
import streamlit as st
from tqdm import tqdm
from speechbrain.pretrained import Tacotron2, HIFIGAN

# Paths
output_path = "./processed_data/"
os.makedirs(output_path, exist_ok=True)

# Preprocessing Function
def preprocess_audio(audio_path, max_length=1000):
    """
    Preprocess the audio file to generate mel spectrogram with uniform length.
    """
    wav, sr = librosa.load(audio_path, sr=24000)
    mel_spectrogram = librosa.feature.melspectrogram(
        y=wav, sr=sr, n_fft=2048, hop_length=256, n_mels=120
    )
    mel_spectrogram = np.log(np.maximum(1e-5, mel_spectrogram))  # Log normalization

    # Ensure all mel spectrograms have the same time dimension
    if mel_spectrogram.shape[1] > max_length:  # Truncate
        mel_spectrogram = mel_spectrogram[:, :max_length]
    else:  # Pad
        padding = max_length - mel_spectrogram.shape[1]
        mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, padding)), mode="constant")

    return mel_spectrogram

# Function to Split Long Text into Chunks
def split_text_into_chunks(text, max_chunk_length=200):
    """
    Splits the input text into smaller chunks, each of up to `max_chunk_length` characters.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        if current_length + len(word) + 1 > max_chunk_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += len(word) + 1  # Account for space

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Generate Speech for Long Text
def generate_speech(text, tacotron2, hifi_gan, output_file="long_speech.wav", sample_rate=24000):
    """
    Generates a long speech by splitting the text into chunks, generating audio for each,
    and concatenating the waveforms.
    """
    chunks = split_text_into_chunks(text)
    waveforms = []

    for chunk in tqdm(chunks, desc="Generating speech"):
        text_input = [str(chunk)]
        mel_output, mel_length, alignment = tacotron2.encode_batch(text_input)
        waveform = hifi_gan.decode_batch(mel_output)
        waveforms.append(waveform.squeeze().cpu().numpy())

    # Concatenate waveforms
    long_waveform = np.concatenate(waveforms, axis=0)

    # Save the concatenated audio
    sf.write(output_file, long_waveform, sample_rate)
    print(f"Audio has been synthesized and saved as '{output_file}'.")

# Load Pretrained Tacotron2 and HiFi-GAN
tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tacotron2")
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_hifigan")

# Fine-tuned model (if available)
if os.path.exists("indic_accent_tacotron2.pth"):
    tacotron2.load_state_dict(torch.load("indic_accent_tacotron2.pth"))
    print("Fine-tuned Tacotron2 model loaded successfully.")

# Streamlit UI
st.title("Text to Speech Generator")

# Text input for the user
text_input = st.text_area("Enter the text you want to convert to speech:", 
                          "Good morning, lovely listeners! This is your favorite RJ, Sapna...")

# Button to generate speech
if st.button("Generate Speech"):
    if text_input:
        output_file = "output_long_speech.wav"
        
        # Generate speech for the provided text
        with st.spinner("Generating speech..."):
            generate_speech(text_input, tacotron2, hifi_gan, output_file)

        # Provide download link
        st.success("Speech generation complete!")
        st.audio(output_file, format="audio/wav")
        st.download_button(label="Download Speech", data=open(output_file, "rb").read(), file_name=output_file, mime="audio/wav")
    else:
        st.warning("Please enter some text to generate speech.")