Spaces:

Tejasva-Maurya
/

Hindi_SpeechT5_finetuned

Running

App Files Files Community

Tejasva-Maurya committed on Oct 25, 2024

Commit

3ff113e

verified ·

1 Parent(s): 1eb0357

Create app.py

Browse files

Files changed (1) hide show

app.py +166 -0

app.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import gradio as gr
+import torch
+import python_multipart
+import os
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset, Audio
+import numpy as np
+from speechbrain.inference import EncoderClassifier
+# Load models and processor
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("Tejasva-Maurya/Hindi_SpeechT5_finetuned")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+# Load speaker encoder
+device = "cuda" if torch.cuda.is_available() else "cpu"
+speaker_model = EncoderClassifier.from_hparams(
+    source="speechbrain/spkrec-xvect-voxceleb",
+    run_opts={"device": device},
+    savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb")
+)
+def create_speaker_embedding(waveform):
+      with torch.no_grad():
+          speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
+          speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+          speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
+      return speaker_embeddings
+def prepare_dataset(example):
+    audio = example["audio"]
+    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
+    return example
+# Load a sample from the dataset for speaker embedding
+try:
+  dataset = load_dataset("mozilla-foundation/common_voice_17_0", "hi", split="validated", trust_remote_code=True)
+  dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+  spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+  device = "cuda" if torch.cuda.is_available() else "cpu"
+  speaker_model = EncoderClassifier.from_hparams(
+      source=spk_model_name,
+      run_opts={"device": device},
+      savedir=os.path.join("/tmp", spk_model_name),
+  )
+  # Calculate the number of rows for a part of the dataset
+  part = len(dataset) //800
+  # Select the part of the dataset
+  dataset = dataset.select(range(part))
+  # Prepare the dataset
+  dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)
+  example = dataset[10]
+  speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
+except Exception as e:
+    print(f"Error loading dataset: {e}")
+    # Use a random speaker embedding as fallback
+    speaker_embedding = torch.randn(1, 512)
+def text_preprocessing(text):
+     replacements = [
+       # Vowels and vowel matras
+       ("अ", "a"),
+       ("आ", "aa"),
+       ("इ", "i"),
+       ("ई", "ee"),
+       ("उ", "u"),
+       ("ऊ", "oo"),
+       ("ऋ", "ri"),
+       ("ए", "e"),
+       ("ऐ", "ai"),
+       ("ऑ", "o"),  # More accurate than 'au' for ऑ
+       ("ओ", "o"),
+       ("औ", "au"),
+       # Consonants
+       ("क", "k"),
+       ("ख", "kh"),
+       ("ग", "g"),
+       ("घ", "gh"),
+       ("ङ", "ng"),  # nasal sound
+       ("च", "ch"),
+       ("छ", "chh"),
+       ("ज", "j"),
+       ("झ", "jh"),
+       ("ञ", "ny"),  # 'ny' closer to the actual sound
+       ("ट", "t"),
+       ("ठ", "th"),
+       ("ड", "d"),
+       ("ढ", "dh"),
+       ("ण", "n"),  # Slight improvement for easier pronunciation
+       ("त", "t"),
+       ("थ", "th"),
+       ("द", "d"),
+       ("ध", "dh"),
+       ("न", "n"),
+       ("प", "p"),
+       ("फ", "ph"),
+       ("ब", "b"),
+       ("भ", "bh"),
+       ("म", "m"),
+       ("य", "y"),
+       ("र", "r"),
+       ("ल", "l"),
+       ("व", "v"),  # 'v' is closer to the Hindi 'व'
+       ("श", "sh"),
+       ("ष", "sh"),  # Same sound in modern pronunciation
+       ("स", "s"),
+       ("ह", "h"),
+       # Consonant clusters and special consonants
+       ("क्ष", "ksh"),
+       ("त्र", "tr"),
+       ("ज्ञ", "gya"),
+       ("श्र", "shra"),
+       # Special characters
+       ("़", ""),    # Ignore nukta; can vary with regional pronunciation
+       ("्", ""),    # Halant - schwa dropping (handled contextually)
+       ("ऽ", ""),    # Avagraha - no direct pronunciation, often ignored
+       ("ं", "n"),   # Anusvara - nasalization
+       ("ः", "h"),   # Visarga - adds an 'h' sound
+       ("ँ", "n"),   # Chandrabindu - nasalization
+       # Vowel matras (diacritic marks)
+       ("ा", "a"),
+       ("ि", "i"),
+       ("ी", "ee"),
+       ("ु", "u"),
+       ("ू", "oo"),
+       ("े", "e"),
+       ("ै", "ai"),
+       ("ो", "o"),
+       ("ौ", "au"),
+       ("ृ", "ri"),  # Vowel-matra equivalent of ऋ
+       # Nasalization and other marks
+       ("ॅ", "e"),   # Short 'e' sound (very rare)
+       ("ॉ", "o"),   # Short 'o' sound (very rare)
+       # Loanwords and aspirated consonants
+       ("क़", "q"),
+       ("ख़", "kh"),
+       ("ग़", "gh"),
+       ("ज़", "z"),
+       ("ड़", "r"),
+       ("ढ़", "rh"),
+       ("फ़", "f"),
+       # Punctuation
+       ("।", "."),   # Hindi sentence-ending marker -> period
+   ]
+    # Remove extra whitespace
+    text = ' '.join(text.split())
+    for src, dst in replacements:
+      text = text.replace(src, dst)
+    return text
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
+    return (16000, speech.numpy())
+iface = gr.Interface(
+    fn=text_to_speech,
+    inputs="text",
+    outputs="audio",
+    title="SpeechT5 finetuned Hindi Text-to-Speech",
+    description="Enter Hindi text to convert it into an Audio"
+)
+iface.launch(share=True)