Spaces:
Runtime error
Runtime error
File size: 2,864 Bytes
a356f8e e927cf5 a356f8e 76e0282 a356f8e e927cf5 a356f8e e927cf5 a356f8e 7529d20 a356f8e 76e0282 a356f8e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import logging
import time
import warnings

import gradio as gr
import librosa
# import torchaudio
from transformers import pipeline
from transformers.utils.logging import disable_progress_bar
# Silence Python warnings and the transformers progress bars so the console
# output is limited to this module's own log lines.
warnings.filterwarnings("ignore")
disable_progress_bar()

# The date format ends with a literal "Z" (the UTC designator), so timestamps
# must actually be rendered in UTC. logging formats record times in local time
# unless the formatter's converter is replaced, hence the dedicated formatter
# (only this handler is affected, not every formatter in the process).
_log_formatter = logging.Formatter(
    fmt="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
)
_log_formatter.converter = time.gmtime
_log_handler = logging.StreamHandler()
_log_handler.setFormatter(_log_formatter)
logging.basicConfig(handlers=[_log_handler])

# Module logger; DEBUG so that the info-level messages below are emitted even
# though the root logger keeps its default level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Checkpoint used for transcription and the sample rate audio is loaded at.
MODEL_NAME = "bofenghuang/asr-wav2vec2-ctc-french"
SAMPLE_RATE = 16_000

# Build the pipeline once at import time so every request reuses it.
pipe = pipeline(model=MODEL_NAME)
logger.info("ASR pipeline has been initialized")
def process_audio_file(audio_file):
    """Load an audio file as a mono waveform sampled at ``SAMPLE_RATE``.

    Args:
        audio_file: Path of the audio file to decode.

    Returns:
        The waveform produced by ``librosa.load``, down-mixed to mono and
        sampled at ``SAMPLE_RATE`` Hz.
    """
    # Request the target rate directly. Without ``sr`` librosa first resamples
    # to its own default rate, and the audio then had to be resampled a second
    # time to reach SAMPLE_RATE -- extra work and an extra lossy step.
    waveform, _ = librosa.load(audio_file, sr=SAMPLE_RATE, mono=True)
    return waveform
def transcribe(microphone_audio_file, uploaded_audio_file):
    """Transcribe the recorded or uploaded audio and return the text.

    Args:
        microphone_audio_file: Path of the microphone recording, or ``None``.
        uploaded_audio_file: Path of the uploaded file, or ``None``.

    Returns:
        The transcription text. When both inputs are given, the microphone
        recording is used and the text is prefixed with a warning; when
        neither is given, an error message is returned instead.
    """
    has_recording = microphone_audio_file is not None
    has_upload = uploaded_audio_file is not None

    # Nothing to transcribe: report it without touching the model.
    if not (has_recording or has_upload):
        return "ERROR: You have to either use the microphone or upload an audio file"

    notice = ""
    if has_recording and has_upload:
        notice = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the upload.
    audio_file = microphone_audio_file if has_recording else uploaded_audio_file

    audio_data = process_audio_file(audio_file)
    # Chunked inference with overlapping strides, as configured below.
    text = pipe(audio_data, chunk_length_s=30, stride_length_s=5)["text"]
    logger.info(f"Transcription for {audio_file}: {text}")

    return notice + text
# Web UI: two optional audio inputs (microphone recording and file upload),
# both passed to ``transcribe`` as file paths, with a plain-text output.
# NOTE(review): ``source``, ``optional``, ``layout`` and ``enable_queue`` look
# like keyword arguments from an older Gradio API -- confirm they are still
# accepted by the Gradio version this app is deployed with.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Record something...", optional=True),
        gr.Audio(source="upload", type="filepath", label="Upload some audio file...", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    # theme="huggingface",
    title="Speech-to-Text in French",
    # Description fixes: removed the duplicated "the", removed the stray dot
    # in the model link's host name, and restored the garbled emoji.
    description=(
        "Realtime demo for French automatic speech recognition. "
        f"Demo uses the fine-tuned checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) "
        "and 🤗 Transformers to transcribe audio files of arbitrary length."
    ),
    allow_flagging="never",
)

# Alternative for local debugging with a public share link:
# iface.launch(server_name="0.0.0.0", debug=True, share=True)
iface.launch(enable_queue=True)
|