LPhilp1943 committed on
Commit
aad7d40
1 Parent(s): 1064862

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -20
app.py CHANGED
@@ -2,41 +2,46 @@ import gradio as gr
2
  import os
3
  import torch
4
  import soundfile as sf
5
- import torchaudio
6
- from transformers import VitsModel, AutoTokenizer
7
- from speechbrain.pretrained import EncoderDecoderASR
8
 
9
  # Ensure the output directory exists
10
  os.makedirs("output_audio", exist_ok=True)
11
 
12
- # Load the Facebook MMS TTS model and tokenizer
13
- model = VitsModel.from_pretrained("facebook/mms-tts-eng")
14
- tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
15
-
16
- # SpeechBrain ASR Model for Speech to Text
17
- asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-conformer-transformerlm-librispeech", savedir="models/asr")
18
 
19
  def speech_to_text(input_audio):
20
- sig, sr = torchaudio.load(input_audio)
21
- if sr != 16000:
22
- sig = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(sig)
23
- transcription = asr_model.transcribe_file(input_audio)
 
 
 
 
 
 
 
24
  return transcription
25
 
26
  def text_to_speech(text):
27
- inputs = tokenizer(text, return_tensors="pt")
 
28
  with torch.no_grad():
29
- output = model(**inputs).waveform
30
  waveform = output.numpy()
31
 
 
32
  output_path = "output_audio/text_to_speech.wav"
33
  sf.write(output_path, waveform.squeeze(), 22050)
34
 
35
  return output_path
36
 
37
  def speech_to_speech(input_audio, target_text):
38
- # Use speech_to_text to transcribe, then synthesize speech from the transcription
39
- transcription = speech_to_text(input_audio)
40
  return text_to_speech(target_text)
41
 
42
  iface = gr.Interface(
@@ -55,11 +60,10 @@ iface = gr.Interface(
55
  "Text to Speech": gr.outputs.Audio(type="file", label="Synthesized Speech"),
56
  "Speech to Speech": gr.outputs.Audio(type="file", label="Synthesized Speech")
57
  },
58
- title="Speech Processing App",
59
- description="This app uses SpeechBrain for speech to text and Facebook's MMS for text to speech.",
60
  layout="vertical"
61
  )
62
 
63
  if __name__ == "__main__":
64
  iface.launch()
65
-
 
2
  import os
3
  import torch
4
  import soundfile as sf
5
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, VitsModel, AutoTokenizer
 
 
6
 
7
  # Ensure the output directory exists
8
  os.makedirs("output_audio", exist_ok=True)
9
 
10
+ # Load the models and processors
11
+ asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
12
+ asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
13
+ tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
14
+ tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 
15
 
16
  def speech_to_text(input_audio):
17
+ # Load and preprocess the audio
18
+ waveform, sr = sf.read(input_audio)
19
+ input_values = asr_processor(waveform, sampling_rate=sr, return_tensors="pt").input_values
20
+
21
+ # Perform speech recognition
22
+ with torch.no_grad():
23
+ logits = asr_model(input_values).logits
24
+ predicted_ids = torch.argmax(logits, dim=-1)
25
+
26
+ # Decode the predicted IDs to text
27
+ transcription = asr_processor.batch_decode(predicted_ids)[0]
28
  return transcription
29
 
30
  def text_to_speech(text):
31
+ # Tokenize text and generate waveform
32
+ inputs = tts_tokenizer(text, return_tensors="pt")
33
  with torch.no_grad():
34
+ output = tts_model(**inputs).waveform
35
  waveform = output.numpy()
36
 
37
+ # Define output path and save waveform as audio file
38
  output_path = "output_audio/text_to_speech.wav"
39
  sf.write(output_path, waveform.squeeze(), 22050)
40
 
41
  return output_path
42
 
43
  def speech_to_speech(input_audio, target_text):
44
+ # Synthesize speech directly from target text without transcribing the input audio
 
45
  return text_to_speech(target_text)
46
 
47
  iface = gr.Interface(
 
60
  "Text to Speech": gr.outputs.Audio(type="file", label="Synthesized Speech"),
61
  "Speech to Speech": gr.outputs.Audio(type="file", label="Synthesized Speech")
62
  },
63
+ title="Speech Processing Application",
64
+ description="This app uses Facebook's Wav2Vec 2.0 for speech-to-text and VITS for text-to-speech.",
65
  layout="vertical"
66
  )
67
 
68
  if __name__ == "__main__":
69
  iface.launch()