LPhilp1943 committed on
Commit
ff017ef
1 Parent(s): 6ef1f34

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -28
app.py CHANGED
@@ -1,50 +1,46 @@
1
  import gradio as gr
2
- import torchaudio
3
  import os
 
4
  import soundfile as sf
5
- from speechbrain.pretrained import EncoderClassifier, Tacotron2, HIFIGAN, EncoderDecoderASR
 
 
 
6
 
7
- # Ensure output directory exists
8
  os.makedirs("output_audio", exist_ok=True)
9
 
10
- # Load models
11
- encoder = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="models/encoder")
12
- tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="models/tacotron2")
13
- hifigan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="models/hifigan")
 
14
  asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-conformer-transformerlm-librispeech", savedir="models/asr")
15
 
16
  def speech_to_text(input_audio):
17
  sig, sr = torchaudio.load(input_audio)
18
- # Ensure the sample rate is 16000, expected by the model
19
  if sr != 16000:
20
  sig = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(sig)
21
  transcription = asr_model.transcribe_file(input_audio)
22
  return transcription
23
 
24
- def speech_to_speech(input_audio, target_text):
25
- # Load and encode speaker from input audio
26
- signal, fs = torchaudio.load(input_audio)
27
- if fs != 16000:
28
- signal = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)(signal)
29
- embedding = encoder.encode_batch(signal)
30
-
31
- # Synthesize speech from text
32
- mel_output, mel_length, alignment = tacotron2.encode_text(target_text, embedding)
33
- waveform = hifigan.decode_batch(mel_output)
34
-
35
- # Save output audio
36
- output_path = "output_audio/synthesized_speech.wav"
37
- sf.write(output_path, waveform.squeeze().cpu().numpy(), 22050)
38
- return output_path
39
-
40
  def text_to_speech(text):
41
- mel_output, mel_length, alignment = tacotron2.encode_text(text)
42
- waveform = hifigan.decode_batch(mel_output)
 
 
43
 
44
  output_path = "output_audio/text_to_speech.wav"
45
- sf.write(output_path, waveform.squeeze().cpu().numpy(), 22050)
46
  return output_path
47
 
 
 
 
 
 
 
 
48
  iface = gr.Interface(
49
  fn={
50
  "Speech to Text": speech_to_text,
@@ -62,7 +58,7 @@ iface = gr.Interface(
62
  "Speech to Speech": gr.outputs.Audio(type="file", label="Synthesized Speech")
63
  },
64
  title="Speech Processing App",
65
- description="Upload an audio file or enter text to perform various speech processing tasks.",
66
  layout="vertical"
67
  )
68
 
 
1
  import gradio as gr
 
2
  import os
3
+ import torch
4
  import soundfile as sf
5
+ import torchaudio
6
+ from scipy.io.wavfile import write
7
+ from transformers import VitsProcessor, VitsForConditionalGeneration
8
+ from speechbrain.pretrained import EncoderClassifier, EncoderDecoderASR
9
 
10
# Ensure the output directory exists before any handler tries to write to it.
os.makedirs("output_audio", exist_ok=True)

# Load the Facebook MMS TTS model and tokenizer.
# NOTE(review): transformers exposes VitsTokenizer/VitsModel for the
# facebook/mms-tts-* checkpoints; VitsProcessor and
# VitsForConditionalGeneration do not exist in the library, so the
# previous from_pretrained calls raised at import time.
from transformers import VitsModel, VitsTokenizer

tts_processor = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")

# SpeechBrain ASR model used by speech_to_text.
asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-conformer-transformerlm-librispeech", savedir="models/asr")
19
 
20
def speech_to_text(input_audio):
    """Transcribe an audio file with the SpeechBrain ASR model.

    Args:
        input_audio: path to the audio file to transcribe.

    Returns:
        The transcription string produced by the ASR model.
    """
    # The previous manual torchaudio.load + Resample pass was dead code:
    # the resampled tensor was never handed to the model, which was given
    # the raw file path instead. Dropping it avoids decoding the audio
    # twice; transcribe_file reads the file itself.
    return asr_model.transcribe_file(input_audio)
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
def text_to_speech(text):
    """Synthesize English speech from *text* with the MMS-TTS VITS model.

    Args:
        text: the text to synthesize.

    Returns:
        Path to the written WAV file ("output_audio/text_to_speech.wav").
    """
    inputs = tts_processor(text, return_tensors="pt")
    # VITS is a feed-forward TTS model: a plain forward pass returns the
    # waveform directly. There is no autoregressive .generate()/.audio API,
    # so the previous call failed at runtime.
    with torch.no_grad():
        output = tts_model(**inputs)
    waveform = output.waveform.squeeze().cpu().numpy()

    # Write at the model's native rate. facebook/mms-tts-eng outputs 16 kHz
    # audio, so the previous hard-coded 22050 would have sped up / pitched
    # the result.
    sample_rate = getattr(tts_model.config, "sampling_rate", 16000)
    output_path = "output_audio/text_to_speech.wav"
    sf.write(output_path, waveform, sample_rate)
    return output_path
36
 
37
def speech_to_speech(input_audio, target_text):
    """Transcribe input audio and synthesize speech for the requested text.

    The previous implementation always ran a full ASR pass on the input and
    then discarded the transcription. Now the (expensive) transcription is
    only computed when no target text is supplied, and in that case it is
    actually used as the text to synthesize.

    Args:
        input_audio: path to the source audio file.
        target_text: text to synthesize; if empty/None, the transcription of
            input_audio is synthesized instead.

    Returns:
        Path to the synthesized WAV file (from text_to_speech).
    """
    text = target_text if target_text else speech_to_text(input_audio)
    return text_to_speech(text)
43
+
44
  iface = gr.Interface(
45
  fn={
46
  "Speech to Text": speech_to_text,
 
58
  "Speech to Speech": gr.outputs.Audio(type="file", label="Synthesized Speech")
59
  },
60
  title="Speech Processing App",
61
+ description="This app uses SpeechBrain for speech to text and Facebook's MMS for text to speech.",
62
  layout="vertical"
63
  )
64