LPhilp1943 committed on
Commit 6ef1f34
1 Parent(s): 7cbb513

Update app.py

Files changed (1)
  1. app.py +6 -3
app.py CHANGED
@@ -1,8 +1,8 @@
 import gradio as gr
 import torchaudio
-from speechbrain.pretrained import EncoderClassifier, Tacotron2, HIFIGAN, ASR
 import os
 import soundfile as sf
+from speechbrain.pretrained import EncoderClassifier, Tacotron2, HIFIGAN, EncoderDecoderASR
 
 # Ensure output directory exists
 os.makedirs("output_audio", exist_ok=True)
@@ -11,11 +11,14 @@ os.makedirs("output_audio", exist_ok=True)
 encoder = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="models/encoder")
 tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="models/tacotron2")
 hifigan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="models/hifigan")
-asr = ASR.from_hparams(source="speechbrain/asr-transformer-librispeech", savedir="models/asr")
+asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-conformer-transformerlm-librispeech", savedir="models/asr")
 
 def speech_to_text(input_audio):
     sig, sr = torchaudio.load(input_audio)
-    transcription = asr.transcribe_file(input_audio)
+    # Ensure the sample rate is 16000, expected by the model
+    if sr != 16000:
+        sig = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(sig)
+    transcription = asr_model.transcribe_file(input_audio)
     return transcription
 
 def speech_to_speech(input_audio, target_text):
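
Note that, as committed, the resampled sig in speech_to_text is not actually passed to the model: transcribe_file(input_audio) reads the original file again, so the resampling step has no effect on the transcription. Below is a minimal sketch of feeding the resampled waveform to the model in memory via SpeechBrain's EncoderDecoderASR.transcribe_batch; the mono downmix and the relative-length tensor of [1.0] are illustrative assumptions for a single-utterance call, not part of this commit.

import torch
import torchaudio
from speechbrain.pretrained import EncoderDecoderASR

asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-conformer-transformerlm-librispeech",
    savedir="models/asr",
)

def speech_to_text(input_audio):
    # Load audio as a [channels, time] tensor and downmix to mono.
    sig, sr = torchaudio.load(input_audio)
    sig = sig.mean(dim=0, keepdim=True)
    # Resample to the 16 kHz rate expected by the LibriSpeech model.
    if sr != 16000:
        sig = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(sig)
    # Transcribe the in-memory waveform (batch of 1, full relative length).
    words, _ = asr_model.transcribe_batch(sig, torch.tensor([1.0]))
    return words[0]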