speech-to-text / run_demo.py
bofenghuang's picture
up
76e0282
raw
history blame
2.86 kB
import logging
import warnings
import gradio as gr
import librosa
# import torchaudio
from transformers import pipeline
from transformers.utils.logging import disable_progress_bar
warnings.filterwarnings("ignore")
disable_progress_bar()
# Configure root logging with ISO-8601-style timestamps; this module's logger
# is set to DEBUG independently of the root level.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Fine-tuned French wav2vec2 CTC checkpoint served by the demo.
MODEL_NAME = "bofenghuang/asr-wav2vec2-ctc-french"
# Sample rate (Hz) the model expects; inputs are resampled to this.
SAMPLE_RATE = 16_000
# Loading the pipeline downloads/initializes the model at import time,
# so the app is ready before the first request.
pipe = pipeline(model=MODEL_NAME)
logger.info("ASR pipeline has been initialized")
def process_audio_file(audio_file):
    """Load an audio file as a mono waveform at the model's sample rate.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to load.

    Returns
    -------
    numpy.ndarray
        1-D float waveform resampled to ``SAMPLE_RATE`` (16 kHz).
    """
    # sr=None keeps the file's native rate. The original call omitted `sr`,
    # so librosa silently resampled to its 22 050 Hz default and the branch
    # below then resampled a second time — a wasteful, quality-degrading
    # double resample.
    waveform, sample_rate = librosa.load(audio_file, sr=None, mono=True)
    # Resample once, only if the native rate differs from the model's.
    if sample_rate != SAMPLE_RATE:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
    return waveform
def transcribe(microphone_audio_file, uploaded_audio_file):
    """Transcribe audio from the microphone or an uploaded file.

    The microphone recording takes precedence when both inputs are given;
    a warning is prepended to the transcription in that case. An error
    string is returned when neither input is provided.
    """
    # Guard clause: nothing to transcribe.
    if microphone_audio_file is None and uploaded_audio_file is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warning_message = ""
    if microphone_audio_file is not None:
        audio_file = microphone_audio_file
        # Both inputs present: microphone wins, tell the user.
        if uploaded_audio_file is not None:
            warning_message = (
                "WARNING: You've uploaded an audio file and used the microphone. "
                "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
            )
    else:
        audio_file = uploaded_audio_file

    audio_data = process_audio_file(audio_file)
    # Chunked inference with striding lets the pipeline handle audio of
    # arbitrary length.
    text = pipe(audio_data, chunk_length_s=30, stride_length_s=5)["text"]
    logger.info(f"Transcription for {audio_file}: {text}")
    return warning_message + text
iface = gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(source="microphone", type="filepath", label="Record something...", optional=True),
gr.Audio(source="upload", type="filepath", label="Upload some audio file...", optional=True),
],
outputs="text",
layout="horizontal",
# theme="huggingface",
title="Speech-to-Text in French",
description=f"Realtime demo for French automatic speech recognition. Demo uses the the fine-tuned checkpoint [{MODEL_NAME}](https://huggingface.co./{MODEL_NAME}) and πŸ€— Transformers to transcribe audio files of arbitrary length.",
allow_flagging="never",
)
# iface.launch(server_name="0.0.0.0", debug=True, share=True)
iface.launch(enable_queue=True)