from transformers import pipeline
import evaluate
import numpy as np
import gradio as gr
import json
from pathlib import Path
import re
# Load the word error rate (WER) metric
wer_metric = evaluate.load("wer")
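# Sanity check (illustrative): one substitution over a two-word reference gives WER 0.5:
#   wer_metric.compute(predictions=["hello word"], references=["hello world"])  # -> 0.5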
# Map display names to Hugging Face checkpoints
model_name = {
    "whisper-tiny": "openai/whisper-tiny.en",
    "wav2vec2-large-960h": "facebook/wav2vec2-large-960h",
    "distil-whisper-small": "distil-whisper/distil-small.en",
}

# Build each ASR pipeline once at startup so requests don't reload the models
pipes = {name: pipeline("automatic-speech-recognition", model=ckpt) for name, ckpt in model_name.items()}
# Load the reference transcripts for the bundled example audio clips
with open("ds_data.json", "r") as f:
    table_data = json.load(f)
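# Assumed structure (inferred from the lookups below): a list of objects each
# carrying a "reference" transcript, e.g. [{"reference": "..."}, ...]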
def clean_text(text):
    # Strip punctuation so WER reflects word errors rather than formatting differences
    return re.sub(r'[.,!?]', '', text)
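# e.g. clean_text("Hello, world!") -> "Hello world"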
def compute_wer_table(audio, text):
    """Transcribe `audio` with each model and compute its WER against `text`."""
    # Gradio delivers audio as a (sample_rate, int16 numpy array) tuple
    sample_rate, audio_input = audio
    # Downmix stereo to mono if needed (assumption: channel averaging is acceptable here)
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)
    # Convert int16 PCM to float32 in [-1.0, 1.0]
    audio_input = audio_input.astype(np.float32) / 32767

    trans = []
    wer_scores = []
    for model in model_name:
        # Pass the sample rate so the pipeline can resample to the model's expected rate
        transcription = pipes[model]({"raw": audio_input, "sampling_rate": sample_rate})["text"]
        transcription = clean_text(transcription)
        trans.append(transcription)
        # Uppercase both sides so the comparison is case-insensitive
        wer = wer_metric.compute(predictions=[transcription.upper()], references=[text.upper()])
        wer_scores.append(wer)

    return [[model, t, s] for model, t, s in zip(model_name.keys(), trans, wer_scores)]
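# Illustrative call (hypothetical input: one second of silence at 16 kHz):
#   compute_wer_table((16000, np.zeros(16000, dtype=np.int16)), "HELLO")
# returns rows of the form [["whisper-tiny", "<transcription>", <wer>], ...]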
with gr.Blocks() as demo:
    with gr.Tab("Docs"):
        gr.Markdown((Path(__file__).parent / "demo.md").read_text())
    with gr.Tab("Demo"):
        gr.Interface(
            fn=compute_wer_table,
            inputs=[
                gr.Audio(label="Input Audio"),
                gr.Textbox(label="Reference Text"),
            ],
            outputs=gr.Dataframe(headers=["Model", "Transcription", "WER"], label="WER Results"),
            examples=[[f"assets/output_audio_{i}.wav", table_data[i]["reference"]] for i in range(100)],
            title="ASR Model Evaluation",
            description=(
                "This application evaluates the performance of several Automatic Speech Recognition (ASR) models on "
                "a given audio sample. Provide an audio file and the corresponding reference text, and the app computes "
                "the Word Error Rate (WER) for each model. The results are presented in a table listing the model name, "
                "the transcribed text, and the calculated WER. "
                "\n\n### Table of Results\n"
                "The table below shows the WER each model achieved on the bundled evaluation set. "
                "Lower WER scores indicate better performance."
                "\n\n| Model | WER |\n"
                "|--------------------------|--------------------------|\n"
                "| [whisper-tiny](https://huggingface.co./openai/whisper-tiny.en) | 0.05511 |\n"
                "| [wav2vec2-large-960h](https://huggingface.co./facebook/wav2vec2-large-960h) | 0.01617 |\n"
                "| [distil-whisper-small](https://huggingface.co./distil-whisper/distil-small.en) | 0.03686 |\n"
                "\n\n### Data Source\n"
                "The data used in this demo is a subset of the [LibriSpeech](https://huggingface.co./datasets/openslr/librispeech_asr) "
                "dataset: the first 100 audio samples from the validation set, together with their reference texts."
            ),
        )
demo.launch()