annapurnapadmaprema-ji committed
Commit 0065d28 · verified · 1 Parent(s): f108014

Create app.py

Files changed (1)
  1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import torch
+ import numpy as np
+ import librosa
+ import soundfile as sf
+ import streamlit as st
+ from tqdm import tqdm
+ from speechbrain.pretrained import Tacotron2, HIFIGAN
+
+ # Paths
+ output_path = "./processed_data/"
+ os.makedirs(output_path, exist_ok=True)
+
+ # Preprocessing Function
+ def preprocess_audio(audio_path, max_length=1000):
+     """
+     Preprocess the audio file to generate mel spectrogram with uniform length.
+     """
+     wav, sr = librosa.load(audio_path, sr=24000)
+     mel_spectrogram = librosa.feature.melspectrogram(
+         y=wav, sr=sr, n_fft=2048, hop_length=256, n_mels=120
+     )
+     mel_spectrogram = np.log(np.maximum(1e-5, mel_spectrogram)) # Log normalization
+
+     # Ensure all mel spectrograms have the same time dimension
+     if mel_spectrogram.shape[1] > max_length: # Truncate
+         mel_spectrogram = mel_spectrogram[:, :max_length]
+     else: # Pad
+         padding = max_length - mel_spectrogram.shape[1]
+         mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, padding)), mode="constant")
+
+     return mel_spectrogram
+
+ # Function to Split Long Text into Chunks
+ def split_text_into_chunks(text, max_chunk_length=200):
+     """
+     Splits the input text into smaller chunks, each of up to `max_chunk_length` characters.
+     """
+     words = text.split()
+     chunks = []
+     current_chunk = []
+     current_length = 0
+
+     for word in words:
+         if current_length + len(word) + 1 > max_chunk_length:
+             chunks.append(" ".join(current_chunk))
+             current_chunk = []
+             current_length = 0
+         current_chunk.append(word)
+         current_length += len(word) + 1 # Account for space
+
+     if current_chunk:
+         chunks.append(" ".join(current_chunk))
+
+     return chunks
+
+ # Generate Speech for Long Text
+ def generate_speech(text, tacotron2, hifi_gan, output_file="long_speech.wav", sample_rate=24000):
+     """
+     Generates a long speech by splitting the text into chunks, generating audio for each,
+     and concatenating the waveforms.
+     """
+     chunks = split_text_into_chunks(text)
+     waveforms = []
+
+     for chunk in tqdm(chunks, desc="Generating speech"):
+         text_input = [str(chunk)]
+         mel_output, mel_length, alignment = tacotron2.encode_batch(text_input)
+         waveform = hifi_gan.decode_batch(mel_output)
+         waveforms.append(waveform.squeeze().cpu().numpy())
+
+     # Concatenate waveforms
+     long_waveform = np.concatenate(waveforms, axis=0)
+
+     # Save the concatenated audio
+     sf.write(output_file, long_waveform, sample_rate)
+     print(f"Audio has been synthesized and saved as '{output_file}'.")
+
+ # Load Pretrained Tacotron2 and HiFi-GAN
+ tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tacotron2")
+ hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_hifigan")
+
+ # Fine-tuned model (if available)
+ if os.path.exists("indic_accent_tacotron2.pth"):
+     tacotron2.load_state_dict(torch.load("indic_accent_tacotron2.pth"))
+     print("Fine-tuned Tacotron2 model loaded successfully.")
+
+ # Streamlit UI
+ st.title("Text to Speech Generator")
+
+ # Text input for the user
+ text_input = st.text_area("Enter the text you want to convert to speech:",
+                           "Good morning, lovely listeners! This is your favorite RJ, Sapna...")
+
+ # Button to generate speech
+ if st.button("Generate Speech"):
+     if text_input:
+         output_file = "output_long_speech.wav"
+
+         # Generate speech for the provided text
+         with st.spinner("Generating speech..."):
+             generate_speech(text_input, tacotron2, hifi_gan, output_file)
+
+         # Provide download link
+         st.success("Speech generation complete!")
+         st.audio(output_file, format="audio/wav")
+         st.download_button(label="Download Speech", data=open(output_file, "rb").read(), file_name=output_file, mime="audio/wav")
+     else:
+         st.warning("Please enter some text to generate speech.")
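
For reference, a minimal standalone sketch of the Tacotron2 → HiFi-GAN call sequence that generate_speech() wraps, useful for smoke-testing the checkpoints without the Streamlit UI. It assumes the same speechbrain/tts-tacotron2-ljspeech and speechbrain/tts-hifigan-ljspeech sources as app.py; the output name sample.wav and the 22050 Hz write rate are assumptions made here (the LJSpeech checkpoints produce 22.05 kHz audio, whereas app.py saves at 24000 Hz).

import soundfile as sf
from speechbrain.pretrained import Tacotron2, HIFIGAN

# Same pretrained sources as app.py; the first run downloads them into the savedir folders.
tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tacotron2")
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_hifigan")

# One short chunk through the same call sequence used inside generate_speech()
mel_output, mel_length, alignment = tacotron2.encode_batch(["Good morning, lovely listeners!"])
waveform = hifi_gan.decode_batch(mel_output)  # tensor of shape [batch, 1, samples]

# Assumption: LJSpeech-trained models generate 22050 Hz audio; app.py writes at 24000 Hz instead.
sf.write("sample.wav", waveform.squeeze().cpu().numpy(), 22050)

Running `streamlit run app.py` drives the same pipeline end to end through the browser UI.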