ganga4364 committed on
Commit
e6c6652
·
verified ·
1 Parent(s): ad0ff30

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -79
app.py CHANGED
@@ -1,94 +1,35 @@
1
  import gradio as gr
2
- import os
3
- import soundfile as sf
4
- import uuid
5
- import datetime
6
- import shutil
7
- from transformers import pipeline
8
  import scipy.io.wavfile
9
  import numpy as np
10
 
11
# Blurb shown under the title in the Gradio UI.
this_description = """Text To Speech for Tibetan - using your fine-tuned TTS model."""

# Fine-tuned TTS checkpoint to load for inference.
model_id = "ganga4364/mms-tts-bod-female"  # Replace with your fine-tuned model's ID

# Text-to-speech pipeline built from the checkpoint above.
synthesiser = pipeline("text-to-speech", model_id)  # Use GPU if available
20
 
21
def prepare_sentences(text, lang="bod"):
    """Split Tibetan text into a list of sentences.

    The Tibetan shey followed by a space ("། ") is normalized to a period
    so the text can be segmented on '.'; empty fragments are dropped.

    Args:
        text: Input text (Tibetan or plain text containing periods).
        lang: Language tag (kept for interface compatibility; unused).

    Returns:
        List of stripped, non-empty sentence strings.
    """
    normalized = text.replace("། ", ".")
    return [piece.strip() for piece in normalized.split(".") if piece.strip()]
30
 
31
# Function to combine all generated WAV files into a single file
def combine_wav(source_dir, stamp):
    """Concatenate every WAV chunk in *source_dir* into one WAV file.

    Chunks are combined in alphabetical order; the caller zero-pads chunk
    names, so this matches generation order. The chunk directory is removed
    after the combined file is written.

    Args:
        source_dir: Directory containing the per-sentence WAV chunks.
        stamp: Unique prefix for the combined output file name.

    Returns:
        Path of the combined file, "<stamp>_combined.wav".

    Raises:
        ValueError: If the directory holds no WAV files, or the chunks do
            not all share one sample rate.
    """
    # Sort alphabetically to guarantee correct combination order.
    wav_files = sorted(
        file for file in os.listdir(source_dir) if file.endswith(".wav")
    )
    if not wav_files:
        # Previously this fell through and sf.write crashed with sr=None.
        raise ValueError(f"No WAV files found in {source_dir!r}")

    combined_data = []
    sr = None
    for file in wav_files:
        data, sample_rate = sf.read(os.path.join(source_dir, file))
        if sr is None:
            sr = sample_rate  # sample rate taken from the first chunk
        elif sample_rate != sr:
            # Mismatched rates used to be silently concatenated, producing
            # audibly corrupted output; fail loudly instead.
            raise ValueError(
                f"Inconsistent sample rate in {file}: {sample_rate} != {sr}"
            )
        combined_data.extend(data)

    # Save the combined audio to a new WAV file.
    combined_file_path = f"{stamp}_combined.wav"
    sf.write(combined_file_path, combined_data, sr)

    # Clean up the temporary per-sentence chunk directory.
    shutil.rmtree(source_dir)

    return combined_file_path
57
-
58
def tts_tibetan(input_text):
    """Synthesize *input_text* sentence by sentence and return one combined WAV path."""
    sentences = prepare_sentences(input_text)

    # Unique working directory per request so concurrent calls do not collide.
    stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    chunk_dir = f"u_{stamp}"
    os.makedirs(chunk_dir, exist_ok=True)

    # Synthesize each sentence into its own zero-padded chunk file.
    for index, sentence in enumerate(sentences):
        result = synthesiser(sentence)
        chunk_path = f"{chunk_dir}/s_{index:010d}.wav"
        scipy.io.wavfile.write(
            chunk_path, rate=result["sampling_rate"], data=result["audio"][0]
        )

    # Merge all chunks into a single file; Gradio serves the returned path.
    return combine_wav(chunk_dir, stamp)
83
 
84
# Build the Gradio app: a text box feeding tts_tibetan, with audio output.
iface = gr.Interface(
    fn=tts_tibetan,
    inputs="text",
    outputs="audio",  # path of the combined WAV file
    title="Tibetan TTS Model",
    description=this_description,
)

# Start serving the interface.
iface.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 
 
 
 
 
3
  import scipy.io.wavfile
4
  import numpy as np
5
 
6
# Fine-tuned MMS-TTS checkpoint for Tibetan (bod).
model_id = "ganga4364/mms-tts-bod-female"  # Replace with your fine-tuned model if necessary

# Text-to-speech pipeline built around the checkpoint.
# NOTE: pass device=0 to pipeline(...) to run on a GPU.
synthesiser = pipeline("text-to-speech", model_id)
12
 
 
 
13
 
14
# Function to perform TTS inference and save audio to a file
def generate_audio(input_text):
    """Synthesize *input_text* with the TTS pipeline and write it to a WAV file.

    Args:
        input_text: Tibetan text to synthesize.

    Returns:
        Path of the written WAV file, consumed by Gradio's audio output.
    """
    import tempfile  # stdlib, used only here

    # Perform TTS inference.
    speech = synthesiser(input_text)

    # A fixed name ("finetuned_output.wav") let concurrent Gradio requests
    # overwrite each other's audio; write to a unique temp file instead.
    with tempfile.NamedTemporaryFile(
        suffix=".wav", delete=False, dir="."
    ) as tmp:
        file_path = tmp.name

    # assumes speech["audio"] is (channels, samples); row 0 is the mono
    # waveform — TODO confirm against the pipeline's output format.
    scipy.io.wavfile.write(
        file_path, rate=speech["sampling_rate"], data=speech["audio"][0]
    )

    # Return the path to the audio file.
    return file_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
# Wire the TTS function into a minimal Gradio app.
iface = gr.Interface(
    fn=generate_audio,
    inputs="text",    # Tibetan text entry
    outputs="audio",  # path of the generated WAV file
    title="Tibetan Text-to-Speech (MMS-TTS)",
    description="Enter Tibetan text and generate speech using MMS-TTS.",
)

# Serve the interface.
iface.launch()