# Scrape artifact from the Hugging Face Spaces page header (not source code):
# Spaces:
# Running
# Running
import logging
import warnings

import gradio as gr
import librosa
from transformers import pipeline
from transformers.utils.logging import disable_progress_bar

# Keep the Space's log output readable: silence library warnings and the
# transformers model-download progress bar.
warnings.filterwarnings("ignore")
disable_progress_bar()
# Root-logger format: ISO-8601-style timestamp, level, logger name, message.
logging.basicConfig(
    datefmt="%Y-%m-%dT%H:%M:%SZ",
    format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
)

# Module-level logger; DEBUG so local runs show everything.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Fine-tuned French wav2vec2 CTC checkpoint used for transcription.
MODEL_NAME = "bofenghuang/asr-wav2vec2-ctc-french"
# Sampling rate (Hz) the wav2vec2 model expects its input audio at.
SAMPLE_RATE = 16_000

# Build the ASR pipeline once at import time (downloads the model on first run).
pipe = pipeline(model=MODEL_NAME)
logger.info("ASR pipeline has been initialized")
def process_audio_file(audio_file):
    """Load an audio file as a mono waveform at SAMPLE_RATE Hz.

    Parameters
    ----------
    audio_file : str
        Path to an audio file in any format librosa can read.

    Returns
    -------
    numpy.ndarray
        1-D float mono waveform at SAMPLE_RATE (16 kHz).
    """
    # sr=None preserves the file's native sampling rate. The previous call
    # relied on librosa's default sr=22050, which silently resampled every
    # file to 22.05 kHz and then resampled AGAIN to 16 kHz below — a lossy
    # double resample.
    waveform, sample_rate = librosa.load(audio_file, sr=None, mono=True)
    # Single resample to the model's expected rate, only when needed.
    if sample_rate != SAMPLE_RATE:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
    return waveform
def transcribe(microphone_audio_file, uploaded_audio_file):
    """Transcribe audio from the microphone or an uploaded file.

    When both inputs are present, the microphone recording wins and a
    warning is prepended to the transcription. When neither is present,
    an error string is returned instead.
    """
    # Guard clause: nothing to transcribe.
    if microphone_audio_file is None and uploaded_audio_file is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warning_message = ""
    if microphone_audio_file is not None:
        if uploaded_audio_file is not None:
            warning_message = (
                "WARNING: You've uploaded an audio file and used the microphone. "
                "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
            )
        audio_file = microphone_audio_file
    else:
        audio_file = uploaded_audio_file

    audio_data = process_audio_file(audio_file)
    # Chunked inference so the CTC pipeline can handle arbitrarily long audio.
    text = pipe(audio_data, chunk_length_s=30, stride_length_s=5)["text"]
    logger.info(f"Transcription for {audio_file}: {text}")
    return warning_message + text
# Gradio UI: two optional audio inputs; `transcribe` prefers the microphone
# recording when both are provided.
# NOTE(review): `source=`, `optional=`, `layout=`, and `enable_queue=` are
# Gradio 3.x parameters; a Gradio 4+ upgrade would need `sources=[...]`,
# required/None handling, and `iface.queue()` instead.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Record something...", optional=True),
        gr.Audio(source="upload", type="filepath", label="Upload some audio file...", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    title="Speech-to-Text in French",
    # Fixed user-facing typos: duplicated "the the" and the mis-encoded
    # Hugging Face emoji ("π€" -> 🤗).
    description=f"Realtime demo for French automatic speech recognition. Demo uses the fine-tuned checkpoint [{MODEL_NAME}](https://huggingface.co./{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of arbitrary length.",
    allow_flagging="never",
)

iface.launch(enable_queue=True)