Tejasva-Maurya committed on
Commit
3ff113e
·
verified ·
1 Parent(s): 1eb0357

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -0
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import python_multipart
4
+ import os
5
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
+ from datasets import load_dataset, Audio
7
+ import numpy as np
8
+ from speechbrain.inference import EncoderClassifier
9
+
10
# Text-to-speech stack: the stock SpeechT5 processor and HiFi-GAN vocoder,
# combined with the Hindi fine-tuned SpeechT5 acoustic model.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("Tejasva-Maurya/Hindi_SpeechT5_finetuned")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker encoder used to turn a reference waveform into a speaker embedding.
# It runs on the GPU when one is available; its files are cached under /tmp.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

_speaker_repo = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=_speaker_repo,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", _speaker_repo),
)
22
+
23
def create_speaker_embedding(waveform):
    """Return a normalised speaker embedding for ``waveform`` as a NumPy array.

    The waveform is converted to a tensor and encoded with the module-level
    ``speaker_model``. The result is L2-normalised along dimension 2, stripped
    of singleton dimensions and moved to the CPU.

    Args:
        waveform: Audio samples accepted by ``torch.tensor`` (presumably a 1-D
            float array sampled at the rate the encoder expects; confirm
            against the caller).

    Returns:
        numpy.ndarray: The squeezed, normalised embedding.
    """
    signal = torch.tensor(waveform)
    # Inference only: no autograd graph is needed, and ``.numpy()`` requires a
    # tensor that does not track gradients.
    with torch.no_grad():
        encoded = speaker_model.encode_batch(signal)
        unit_length = torch.nn.functional.normalize(encoded, dim=2)
    return unit_length.squeeze().cpu().numpy()
29
def prepare_dataset(example):
    """Attach a speaker embedding to a single dataset row.

    Reads the decoded samples from ``example["audio"]["array"]``, stores the
    embedding computed from them under ``example["speaker_embeddings"]`` and
    returns the same (mutated) mapping, as expected by ``Dataset.map``.
    """
    samples = example["audio"]["array"]
    example["speaker_embeddings"] = create_speaker_embedding(samples)
    return example
33
+
34
# Build the reference speaker embedding that conditions every synthesis call.
# It is taken from one validated Hindi Common Voice clip; if anything in that
# pipeline fails (download, decoding, too few rows, ...) a random vector is
# used instead so the app can still start.
#
# Both paths bind the SAME name, ``speaker_embedding``, which is the name the
# synthesis code reads. Previously the success path bound only
# ``speaker_embeddings``, so a successful dataset load ended in a NameError at
# synthesis time.
try:
    dataset = load_dataset(
        "mozilla-foundation/common_voice_17_0",
        "hi",
        split="validated",
        trust_remote_code=True,
    )
    # Resample on access to 16 kHz.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

    # The speaker encoder (``speaker_model``) is already loaded at module
    # level, so it is reused here rather than loaded a second time.

    # Keep only the first 1/800th of the rows: embeddings are computed for
    # every remaining row, but just one of them is used.
    part = len(dataset) // 800
    dataset = dataset.select(range(part))

    # Replace all original columns with the computed "speaker_embeddings".
    dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

    # Row 10 is the reference speaker; an IndexError (subset shorter than 11
    # rows) falls through to the random fallback below.
    example = dataset[10]
    speaker_embedding = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
except Exception as e:
    # Best-effort startup: report the problem and continue with a random voice.
    print(f"Error loading dataset: {e}")
    # Use a random speaker embedding as fallback
    speaker_embedding = torch.randn(1, 512)

# Backward-compatible alias for the plural name the success path used to bind.
speaker_embeddings = speaker_embedding
60
+
61
# Ordered Devanagari -> Latin substitutions applied by ``text_preprocessing``.
# ORDER MATTERS: entries are applied one after another over the whole text.
#
# NOTE(review): every multi-character source whose first consonant also has
# its own single-character entry earlier in the table (the conjuncts, and the
# nukta consonants if they are stored as consonant + U+093C) can no longer
# match by the time it is reached, because that consonant has already been
# rewritten. For example, "ज्ञ" currently comes out as "jny", not "gya". The
# order is intentionally left untouched so the output stays identical to what
# this app has always produced - presumably the same normalisation the model
# was fine-tuned with; confirm before reordering.
_REPLACEMENTS = (
    # Independent vowels
    ("अ", "a"),
    ("आ", "aa"),
    ("इ", "i"),
    ("ई", "ee"),
    ("उ", "u"),
    ("ऊ", "oo"),
    ("ऋ", "ri"),
    ("ए", "e"),
    ("ऐ", "ai"),
    ("ऑ", "o"),  # More accurate than 'au' for ऑ
    ("ओ", "o"),
    ("औ", "au"),
    # Consonants (no inherent vowel is emitted)
    ("क", "k"),
    ("ख", "kh"),
    ("ग", "g"),
    ("घ", "gh"),
    ("ङ", "ng"),  # nasal sound
    ("च", "ch"),
    ("छ", "chh"),
    ("ज", "j"),
    ("झ", "jh"),
    ("ञ", "ny"),  # 'ny' closer to the actual sound
    ("ट", "t"),
    ("ठ", "th"),
    ("ड", "d"),
    ("ढ", "dh"),
    ("ण", "n"),  # Slight improvement for easier pronunciation
    ("त", "t"),
    ("थ", "th"),
    ("द", "d"),
    ("ध", "dh"),
    ("न", "n"),
    ("प", "p"),
    ("फ", "ph"),
    ("ब", "b"),
    ("भ", "bh"),
    ("म", "m"),
    ("य", "y"),
    ("र", "r"),
    ("ल", "l"),
    ("व", "v"),  # 'v' is closer to the Hindi 'व'
    ("श", "sh"),
    ("ष", "sh"),  # Same sound in modern pronunciation
    ("स", "s"),
    ("ह", "h"),
    # Consonant clusters and special consonants (see NOTE above)
    ("क्ष", "ksh"),
    ("त्र", "tr"),
    ("ज्ञ", "gya"),
    ("श्र", "shra"),
    # Special characters
    ("़", ""),  # Nukta - dropped; can vary with regional pronunciation
    ("्", ""),  # Halant (virama) - simply dropped
    ("ऽ", ""),  # Avagraha - no direct pronunciation, often ignored
    ("ं", "n"),  # Anusvara - nasalization
    ("ः", "h"),  # Visarga - adds an 'h' sound
    ("ँ", "n"),  # Chandrabindu - nasalization
    # Vowel matras (diacritic marks)
    ("ा", "a"),
    ("ि", "i"),
    ("ी", "ee"),
    ("ु", "u"),
    ("ू", "oo"),
    ("े", "e"),
    ("ै", "ai"),
    ("ो", "o"),
    ("ौ", "au"),
    ("ृ", "ri"),  # Vowel-matra equivalent of ऋ
    # Candra vowel signs
    ("ॅ", "e"),  # Short 'e' sound (very rare)
    ("ॉ", "o"),  # Short 'o' sound (very rare)
    # Nukta consonants, mostly loanword sounds (see NOTE above)
    ("क़", "q"),
    ("ख़", "kh"),
    ("ग़", "gh"),
    ("ज़", "z"),
    ("ड़", "r"),
    ("ढ़", "rh"),
    ("फ़", "f"),
    # Punctuation
    ("।", "."),  # Hindi sentence-ending marker -> period
)


def text_preprocessing(text):
    """Romanise Hindi (Devanagari) text for the SpeechT5 tokenizer.

    Runs of whitespace are collapsed to single spaces (leading and trailing
    whitespace is removed), then every substitution in ``_REPLACEMENTS`` is
    applied in table order. Characters without an entry pass through as-is.

    Args:
        text: Input string, typically Devanagari.

    Returns:
        str: The whitespace-normalised, transliterated text.
    """
    # Remove extra whitespace
    text = " ".join(text.split())
    for src, dst in _REPLACEMENTS:
        text = text.replace(src, dst)
    return text


def text_to_speech(text):
    """Synthesise speech for ``text``; this is the callback used by the UI.

    The synthesis statements used to sit after ``return text`` in
    ``text_preprocessing`` with no ``def`` of their own, so they could never
    run and ``text_to_speech`` did not exist. They now form this function and
    operate on the normalised text.

    Args:
        text: Hindi text entered by the user.

    Returns:
        tuple: ``(16000, samples)`` where ``samples`` is the generated speech
        converted to a NumPy array, and 16000 is the sample rate reported
        alongside it.
    """
    text = text_preprocessing(text)
    inputs = processor(text=text, return_tensors="pt")
    # ``speaker_embedding`` is the module-level reference voice.
    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
    return (16000, speech.numpy())
157
+
158
# Minimal Gradio front end: one text box in, one audio player out, backed by
# the ``text_to_speech`` callback.
iface = gr.Interface(
    text_to_speech,
    "text",
    "audio",
    title="SpeechT5 finetuned Hindi Text-to-Speech",
    description="Enter Hindi text to convert it into an Audio",
)

# Start the web server and request a public share link.
iface.launch(share=True)