Update app.py
app.py CHANGED
@@ -1,94 +1,35 @@
 import gradio as gr
-import os
-import soundfile as sf
-import uuid
-import datetime
-import shutil
-from transformers import pipeline
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import scipy.io.wavfile
 import numpy as np

-#
-
-# Load your custom TTS model and processor for inference
-model_id = "ganga4364/mms-tts-bod-female"  # Replace with your fine-tuned model's ID
+# Load the MMS-TTS model and processor for Tibetan (bod)
+model_id = "ganga4364/mms-tts-bod-female"  # Replace with your fine-tuned model if necessary

-# Use the text-to-speech pipeline with the custom model
-synthesiser = pipeline("text-to-speech", model_id)  # Use GPU if available
+# Use the text-to-speech pipeline with the model
+synthesiser = pipeline("text-to-speech", model_id)  # add device=0 if you want to use a GPU

-# Prepare sentences from the input text
-def prepare_sentences(input_text):
-    # (splitting logic elided in the diff view)
-    return sentences
-
-# Combine the per-sentence WAV files into one file
-def combine_wav(source_dir, stamp):
-    # Get a list of all WAV files in the folder
-    wav_files = [file for file in os.listdir(source_dir) if file.endswith(".wav")]
-
-    # Sort the files alphabetically to ensure the correct order of combination
-    wav_files.sort()
-
-    # Combine the WAV files
-    combined_data = []
-    sr = None
-    for file in wav_files:
-        file_path = os.path.join(source_dir, file)
-        data, sample_rate = sf.read(file_path)
-        if sr is None:
-            sr = sample_rate  # Set the sample rate based on the first file
-        combined_data.extend(data)
-
-    # Save the combined audio to a new WAV file
-    combined_file_path = f"{stamp}_combined.wav"
-    sf.write(combined_file_path, combined_data, sr)
-
-    # Clean up temporary files
-    shutil.rmtree(source_dir)
-
-    return combined_file_path
-
-# Main function to process Tibetan text and generate audio
-def tts_tibetan(input_text):
-    # Prepare sentences from the input text using the custom function
-    sentences = prepare_sentences(input_text)
-
-    # Create a unique directory for storing audio chunks
-    current_datetime = datetime.datetime.now()
-    timestamp = current_datetime.strftime("%Y%m%d%H%M%S%f")
-    user_dir = f"u_{timestamp}"
-    os.makedirs(user_dir, exist_ok=True)
-
-    # Generate audio for each sentence using your custom TTS model
-    for i, sentence in enumerate(sentences):
-        # Perform TTS inference for each sentence
-        speech = synthesiser(sentence)
-
-        # Save each sentence as a separate WAV file
-        wav_path = f"{user_dir}/s_{str(i).zfill(10)}.wav"
-        scipy.io.wavfile.write(wav_path, rate=speech["sampling_rate"], data=speech["audio"][0])
-
-    # Combine the generated audio into one file
-    combined_file_path = combine_wav(user_dir, timestamp)
-
-    # Return the path of the combined audio file for Gradio to handle
-    return combined_file_path
+# Function to perform TTS inference and save audio to a file
+def generate_audio(input_text):
+    # Perform TTS inference
+    speech = synthesiser(input_text)
+    file_path = "finetuned_output.wav"
+    # Save the audio to a file (e.g., 'output.wav')
+    scipy.io.wavfile.write(file_path, rate=speech["sampling_rate"], data=speech["audio"][0])
+
+    # Return the path to the audio file
+    return file_path

 # Create the Gradio interface
 iface = gr.Interface(
-    fn=tts_tibetan,
-    inputs="text",
-    outputs="audio",  # Output
-    title="Tibetan TTS",
-    description=
+    fn=generate_audio,
+    inputs="text",  # Text input for the TTS
+    outputs="audio",  # Output will be an audio file
+    title="Tibetan Text-to-Speech (MMS-TTS)",
+    description="Enter Tibetan text and generate speech using MMS-TTS."
 )

 # Launch the Gradio interface
-iface.launch()
+iface.launch()
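The diff view above collapses the body of the removed prepare_sentences helper. For reference, a minimal sketch of what such a splitter could look like, assuming segmentation on the Tibetan shad mark; this is an illustration only, not the code that was actually removed:

# Hypothetical sketch -- the removed implementation is elided in the diff view.
def prepare_sentences(input_text):
    # Split on the Tibetan shad (U+0F0D), which marks clause boundaries.
    chunks = [s.strip() for s in input_text.split("\u0F0D") if s.strip()]
    # Re-attach the shad so each chunk remains natural input for the TTS model.
    return [s + "\u0F0D" for s in chunks]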
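The comment on the new pipeline call points at GPU use via device=0. A minimal sketch of guarded device selection, assuming PyTorch with CUDA support is installed:

import torch
from transformers import pipeline

# Use GPU 0 when CUDA is available; device=-1 keeps the pipeline on the CPU.
device = 0 if torch.cuda.is_available() else -1
synthesiser = pipeline("text-to-speech", "ganga4364/mms-tts-bod-female", device=device)

The text-to-speech pipeline returns a dict holding the waveform under "audio" and its rate under "sampling_rate"; the waveform carries a leading batch dimension, which is why the new code writes speech["audio"][0].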