LPhilp1943 committed on
Commit
1064862
1 Parent(s): c301c7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -13
app.py CHANGED
@@ -3,16 +3,15 @@ import os
3
  import torch
4
  import soundfile as sf
5
  import torchaudio
6
- from scipy.io.wavfile import write
7
- from transformers import VitsProcessor, VitsForConditionalGeneration
8
- from speechbrain.pretrained import EncoderClassifier, EncoderDecoderASR
9
 
10
  # Ensure the output directory exists
11
  os.makedirs("output_audio", exist_ok=True)
12
 
13
- # Load the Facebook MMS TTS model and processor
14
- tts_processor = VitsProcessor.from_pretrained("facebook/mms-tts-eng")
15
- tts_model = VitsForConditionalGeneration.from_pretrained("facebook/mms-tts-eng")
16
 
17
  # SpeechBrain ASR Model for Speech to Text
18
  asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-conformer-transformerlm-librispeech", savedir="models/asr")
@@ -25,20 +24,19 @@ def speech_to_text(input_audio):
25
  return transcription
26
 
27
  def text_to_speech(text):
28
- inputs = tts_processor(text, return_tensors="pt")
29
  with torch.no_grad():
30
- generated = tts_model.generate(**inputs)
31
- waveform = generated.audio.squeeze().cpu().numpy()
32
 
33
  output_path = "output_audio/text_to_speech.wav"
34
- sf.write(output_path, waveform, 22050)
 
35
  return output_path
36
 
37
  def speech_to_speech(input_audio, target_text):
38
- # Speech to Text
39
  transcription = speech_to_text(input_audio)
40
-
41
- # Text to Speech with Facebook MMS TTS
42
  return text_to_speech(target_text)
43
 
44
  iface = gr.Interface(
@@ -64,3 +62,4 @@ iface = gr.Interface(
64
 
65
  if __name__ == "__main__":
66
  iface.launch()
 
 
3
  import torch
4
  import soundfile as sf
5
  import torchaudio
6
+ from transformers import VitsModel, AutoTokenizer
7
+ from speechbrain.pretrained import EncoderDecoderASR
 
8
 
9
  # Ensure the output directory exists
10
  os.makedirs("output_audio", exist_ok=True)
11
 
12
+ # Load the Facebook MMS TTS model and tokenizer
13
+ model = VitsModel.from_pretrained("facebook/mms-tts-eng")
14
+ tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
15
 
16
  # SpeechBrain ASR Model for Speech to Text
17
  asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-conformer-transformerlm-librispeech", savedir="models/asr")
 
24
  return transcription
25
 
def text_to_speech(text):
    """Synthesize *text* with the MMS TTS model and save it as a WAV file.

    Args:
        text: The text to speak. It is tokenized with the module-level
            ``tokenizer`` and fed to the module-level ``model``.

    Returns:
        The path of the written file, ``output_audio/text_to_speech.wav``.
        The same path is reused, so each call overwrites the previous audio.
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # Use the sampling rate declared in the loaded model's config rather than
    # a hard-coded 22050 Hz: writing samples at a rate other than the one they
    # were generated at changes playback speed and pitch.
    sample_rate = model.config.sampling_rate

    output_path = "output_audio/text_to_speech.wav"
    # The model returns a batched tensor; drop the singleton dimensions so
    # soundfile receives a 1-D mono signal.
    sf.write(output_path, waveform.squeeze().cpu().numpy(), sample_rate)
    return output_path
36
 
def speech_to_speech(input_audio, target_text):
    """Transcribe *input_audio*, then synthesize *target_text* as speech.

    The transcription produced by ``speech_to_text`` is not used: the audio
    that is returned is always generated from ``target_text``.
    NOTE(review): confirm whether the transcription was meant to drive the
    synthesis instead of ``target_text``.

    Args:
        input_audio: Audio input, passed unchanged to ``speech_to_text``.
        target_text: Text passed unchanged to ``text_to_speech``.

    Returns:
        Whatever ``text_to_speech(target_text)`` returns.
    """
    # The recognizer is still run, but its return value is not used below.
    speech_to_text(input_audio)
    return text_to_speech(target_text)
41
 
42
  iface = gr.Interface(
 
62
 
63
  if __name__ == "__main__":
64
  iface.launch()
65
+