speech-to-text / run_demo.py
bofenghuang's picture
up
76e0282
raw
history blame
2.86 kB
import logging
import warnings
import gradio as gr
import librosa
# import torchaudio
from transformers import pipeline
from transformers.utils.logging import disable_progress_bar
warnings.filterwarnings("ignore")
disable_progress_bar()
# Configure root logging with ISO-8601-style timestamps; this module's logger
# is set to DEBUG independently of the root level.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Fine-tuned French wav2vec2 CTC checkpoint served by the demo.
MODEL_NAME = "bofenghuang/asr-wav2vec2-ctc-french"
# Sample rate (Hz) the model expects; inputs are resampled to this.
SAMPLE_RATE = 16_000
# Loading the pipeline downloads/initializes the model at import time,
# so the app is ready before the first request.
pipe = pipeline(model=MODEL_NAME)
logger.info("ASR pipeline has been initialized")
def process_audio_file(audio_file):
    """Load an audio file as a mono waveform at the model's sample rate.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to load.

    Returns
    -------
    numpy.ndarray
        1-D float waveform resampled to ``SAMPLE_RATE`` (16 kHz).
    """
    # sr=None keeps the file's native rate. The original call omitted `sr`,
    # so librosa silently resampled to its 22 050 Hz default and the branch
    # below then resampled a second time — a wasteful, quality-degrading
    # double resample.
    waveform, sample_rate = librosa.load(audio_file, sr=None, mono=True)
    # Resample once, only if the native rate differs from the model's.
    if sample_rate != SAMPLE_RATE:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
    return waveform
def transcribe(microphone_audio_file, uploaded_audio_file):
    """Transcribe audio from the microphone or an uploaded file.

    The microphone recording takes precedence when both inputs are given;
    a warning is prepended to the transcription in that case. An error
    string is returned when neither input is provided.
    """
    # Guard clause: nothing to transcribe.
    if microphone_audio_file is None and uploaded_audio_file is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warning_message = ""
    if microphone_audio_file is not None:
        audio_file = microphone_audio_file
        # Both inputs present: microphone wins, tell the user.
        if uploaded_audio_file is not None:
            warning_message = (
                "WARNING: You've uploaded an audio file and used the microphone. "
                "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
            )
    else:
        audio_file = uploaded_audio_file

    audio_data = process_audio_file(audio_file)
    # Chunked inference with striding lets the pipeline handle audio of
    # arbitrary length.
    text = pipe(audio_data, chunk_length_s=30, stride_length_s=5)["text"]
    logger.info(f"Transcription for {audio_file}: {text}")
    return warning_message + text
iface = gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(source="microphone", type="filepath", label="Record something...", optional=True),
gr.Audio(source="upload", type="filepath", label="Upload some audio file...", optional=True),
],
outputs="text",
layout="horizontal",
# theme="huggingface",
title="Speech-to-Text in French",
description=f"Realtime demo for French automatic speech recognition. Demo uses the the fine-tuned checkpoint [{MODEL_NAME}](https://huggingface.co./{MODEL_NAME}) and πŸ€— Transformers to transcribe audio files of arbitrary length.",
allow_flagging="never",
)
# iface.launch(server_name="0.0.0.0", debug=True, share=True)
iface.launch(enable_queue=True)