Spaces:
Build error
Build error
LPhilp1943
committed on
Commit
•
ff017ef
1
Parent(s):
6ef1f34
Update app.py
Browse files
app.py
CHANGED
@@ -1,50 +1,46 @@
|
|
1 |
import gradio as gr
|
2 |
-
import torchaudio
|
3 |
import os
|
|
|
4 |
import soundfile as sf
|
5 |
-
|
|
|
|
|
|
|
6 |
|
7 |
-
# Ensure output directory exists
|
8 |
os.makedirs("output_audio", exist_ok=True)
|
9 |
|
10 |
-
# Load
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
14 |
asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-conformer-transformerlm-librispeech", savedir="models/asr")
|
15 |
|
16 |
def speech_to_text(input_audio):
|
17 |
sig, sr = torchaudio.load(input_audio)
|
18 |
-
# Ensure the sample rate is 16000, expected by the model
|
19 |
if sr != 16000:
|
20 |
sig = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(sig)
|
21 |
transcription = asr_model.transcribe_file(input_audio)
|
22 |
return transcription
|
23 |
|
24 |
-
def speech_to_speech(input_audio, target_text):
|
25 |
-
# Load and encode speaker from input audio
|
26 |
-
signal, fs = torchaudio.load(input_audio)
|
27 |
-
if fs != 16000:
|
28 |
-
signal = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)(signal)
|
29 |
-
embedding = encoder.encode_batch(signal)
|
30 |
-
|
31 |
-
# Synthesize speech from text
|
32 |
-
mel_output, mel_length, alignment = tacotron2.encode_text(target_text, embedding)
|
33 |
-
waveform = hifigan.decode_batch(mel_output)
|
34 |
-
|
35 |
-
# Save output audio
|
36 |
-
output_path = "output_audio/synthesized_speech.wav"
|
37 |
-
sf.write(output_path, waveform.squeeze().cpu().numpy(), 22050)
|
38 |
-
return output_path
|
39 |
-
|
40 |
def text_to_speech(text):
|
41 |
-
|
42 |
-
|
|
|
|
|
43 |
|
44 |
output_path = "output_audio/text_to_speech.wav"
|
45 |
-
sf.write(output_path, waveform
|
46 |
return output_path
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
iface = gr.Interface(
|
49 |
fn={
|
50 |
"Speech to Text": speech_to_text,
|
@@ -62,7 +58,7 @@ iface = gr.Interface(
|
|
62 |
"Speech to Speech": gr.outputs.Audio(type="file", label="Synthesized Speech")
|
63 |
},
|
64 |
title="Speech Processing App",
|
65 |
-
description="
|
66 |
layout="vertical"
|
67 |
)
|
68 |
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import os
|
3 |
+
import torch
|
4 |
import soundfile as sf
|
5 |
+
import torchaudio
|
6 |
+
from scipy.io.wavfile import write
|
7 |
+
from transformers import AutoTokenizer, VitsModel
from speechbrain.pretrained import EncoderClassifier, EncoderDecoderASR

# Ensure the output directory exists
os.makedirs("output_audio", exist_ok=True)

# Load the Facebook MMS TTS checkpoint. MMS-TTS is a VITS model, which
# transformers exposes as ``VitsModel`` paired with a tokenizer; the library
# has no ``VitsProcessor`` / ``VitsForConditionalGeneration`` classes, so
# importing those names fails before the app can start.
tts_processor = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")

# SpeechBrain ASR Model for Speech to Text
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-conformer-transformerlm-librispeech",
    savedir="models/asr",
)


def speech_to_text(input_audio):
    """Transcribe the audio file at ``input_audio`` with the SpeechBrain ASR model.

    Args:
        input_audio: Path to the audio file to transcribe.

    Returns:
        The transcription returned by ``asr_model.transcribe_file``.

    Raises:
        ValueError: If no audio path was supplied.
    """
    if not input_audio:
        raise ValueError("No input audio was provided.")
    # The path is handed straight to ``transcribe_file``, which does its own
    # loading (see the SpeechBrain docs for its normalisation behaviour).
    # The previous manual torchaudio load/resample produced a tensor that was
    # never passed to the model, so it only added a redundant file read.
    return asr_model.transcribe_file(input_audio)


def text_to_speech(text):
    """Synthesize ``text`` with the MMS TTS model and save it as a WAV file.

    Args:
        text: The text to speak.

    Returns:
        Path of the written file, ``output_audio/text_to_speech.wav``.

    Raises:
        ValueError: If ``text`` is empty or only whitespace.
    """
    if not text or not text.strip():
        raise ValueError("No text was provided for speech synthesis.")

    inputs = tts_processor(text, return_tensors="pt")
    with torch.no_grad():
        # VitsModel returns the synthesized audio on the ``waveform`` field.
        waveform = tts_model(**inputs).waveform

    output_path = "output_audio/text_to_speech.wav"
    # Write at the model's own sampling rate; a hard-coded 22050 Hz would play
    # the audio at the wrong speed and pitch whenever the model differs.
    sf.write(
        output_path,
        waveform.squeeze().cpu().numpy(),
        tts_model.config.sampling_rate,
    )
    return output_path
|
36 |
|
37 |
+
def speech_to_speech(input_audio, target_text):
    """Produce synthesized speech for ``target_text`` or, if empty, for the input audio.

    When ``target_text`` is non-empty it is spoken as before. When it is empty
    or only whitespace, the input audio is transcribed and that transcription
    is spoken instead. Previously the transcription was always computed and
    then discarded, costing a full ASR pass with no effect on the output.

    Args:
        input_audio: Path to the source audio file; only used when
            ``target_text`` is empty.
        target_text: Text to synthesize.

    Returns:
        Whatever ``text_to_speech`` returns for the chosen text.
    """
    if target_text and target_text.strip():
        spoken_text = target_text
    else:
        # Speech to Text: fall back to what was said in the input audio.
        spoken_text = speech_to_text(input_audio)

    # Text to Speech with Facebook MMS TTS
    return text_to_speech(spoken_text)
|
43 |
+
|
44 |
iface = gr.Interface(
|
45 |
fn={
|
46 |
"Speech to Text": speech_to_text,
|
|
|
58 |
"Speech to Speech": gr.outputs.Audio(type="file", label="Synthesized Speech")
|
59 |
},
|
60 |
title="Speech Processing App",
|
61 |
+
description="This app uses SpeechBrain for speech to text and Facebook's MMS for text to speech.",
|
62 |
layout="vertical"
|
63 |
)
|
64 |
|