# NOTE: removed Hugging Face Spaces page-scrape residue (status banner,
# file-size line, commit hashes, and line-number gutter) that was not part
# of the original script and made the file unparseable as Python.
from datasets import load_dataset, Dataset
from transformers import pipeline
import evaluate
import numpy as np
import gradio as gr
import json
from pathlib import Path
import re
# Load the word-error-rate (WER) metric used to score each model's transcription.
wer_metric = evaluate.load("wer")
# Display name -> Hugging Face checkpoint id for each ASR model under evaluation.
# NOTE(review): the "wav2vec2-large-960h" key points at the *base* checkpoint
# (facebook/wav2vec2-base-960h), while the docs table below links the large
# one — confirm which model is actually intended.
model_name = {
"whisper-tiny": "openai/whisper-tiny.en",
"wav2vec2-large-960h": "facebook/wav2vec2-base-960h",
"distill-whisper-small": "distil-whisper/distil-small.en",
}
# Load the precomputed reference transcripts; each row is expected to carry a
# 'reference' key (consumed by the Interface examples below) — TODO confirm schema.
with open("ds_data.json", "r") as f:
table_data = json.load(f)
def clean_text(text):
    """Return *text* with the punctuation characters . , ! ? removed."""
    return text.translate(str.maketrans("", "", ".,!?"))
# Cache of lazily-built ASR pipelines keyed by display name, so each model is
# loaded once per process instead of on every button press (model loading is
# by far the most expensive step).
_asr_pipelines = {}

def compute_wer_table(audio, text):
    """Transcribe *audio* with every model in ``model_name`` and score each.

    Args:
        audio: Gradio audio value ``(sample_rate, samples)``; ``samples`` is
            assumed to be int16 PCM (hence the 32767 normalisation) — confirm
            against the gr.Audio component's output format.
        text: Reference transcript to compare each transcription against.

    Returns:
        list[list]: one ``[model_display_name, transcription, wer]`` row per
        model, in ``model_name`` insertion order.
    """
    # Normalise int16 samples to float32 in [-1.0, 1.0], the format the
    # ASR pipelines expect for raw waveform input.
    audio_input = audio[1].astype(np.float32) / 32767

    results = []
    for name, checkpoint in model_name.items():
        if name not in _asr_pipelines:
            _asr_pipelines[name] = pipeline(
                "automatic-speech-recognition", model=checkpoint
            )
        transcription = clean_text(_asr_pipelines[name](audio_input)["text"])
        # Upper-case both sides so WER measures word identity, not casing.
        wer = wer_metric.compute(
            predictions=[transcription.upper()],
            references=[text.upper()],
        )
        results.append([name, transcription, wer])
    return results
# Build the two-tab Gradio app: a "Docs" tab rendering the bundled markdown
# and a "Demo" tab hosting the WER-comparison interface.
with gr.Blocks() as demo:
    with gr.Tab("Docs"):
        # Render the markdown file shipped alongside this script.
        gr.Markdown((Path(__file__).parent / "demo.md").read_text())
    with gr.Tab("Demo"):
        gr.Interface(
            fn=compute_wer_table,
            inputs=[
                gr.Audio(label="Input Audio"),
                gr.Textbox(label="Reference Text")
            ],
            outputs=gr.Dataframe(headers=["Model", "Transcription", "WER"], label="WER Results"),
            # One example per precomputed row; assumes assets/output_audio_0..99.wav
            # exist on disk and table_data has at least 100 entries — TODO confirm.
            examples=[[f"assets/output_audio_{i}.wav", table_data[i]['reference']] for i in range(100)],
            title="ASR Model Evaluation",
            description=(
                "This application allows you to evaluate the performance of various Automatic Speech Recognition (ASR) models on "
                "a given audio sample. Simply provide an audio file and the corresponding reference text, and the app will compute "
                "the Word Error Rate (WER) for each model. The results will be presented in a table that includes the model name, "
                "the transcribed text, and the calculated WER. "
                "\n\n### Table of Results\n"
                "The table below shows the transcriptions generated by different ASR models, along with their corresponding WER scores. "
                "Lower WER scores indicate better performance."
                "\n\n| Model | WER |\n"
                "|--------------------------|--------------------------|\n"
                "| [whisper-tiny](https://huggingface.co./openai/whisper-tiny.en) | 0.05511 |\n"
                "| [wav2vec2-large-960h](https://huggingface.co./facebook/wav2vec2-large-960h) | 0.01617 |\n"
                "| [distill-whisper-small](https://huggingface.co./distil-whisper/distil-small.en)| 0.03686 |\n"
                "\n\n### Data Source\n"
                "The data used in this demo is a subset of the [LibriSpeech](https://huggingface.co./datasets/openslr/librispeech_asr) dataset which contains the first 100 audio samples and their corresponding reference texts in the validation set."
            ),
        )
# Start the Gradio server (blocking call).
demo.launch()