Spaces:

Steveeeeeeen
/

ASR-comparaison

Running

App Files Files Community

Steven Zheng commited on Aug 12, 2024

Commit

ed6583e

1 Parent(s): 461492b

Add application file

Browse files

Files changed (1) hide show

app.py +73 -0

app.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from datasets import load_dataset, Dataset
+from transformers import pipeline
+import evaluate
+import numpy as np
+import gradio as gr
+import json
+from pathlib import Path
+# Load WER metric
+wer_metric = evaluate.load("wer")
+model_name = {
+    "whisper-tiny": "openai/whisper-tiny.en",
+    "wav2vec2-large-960h": "facebook/wav2vec2-base-960h",
+    "distill-whisper-small": "distil-whisper/distil-small.en",
+}
+# open ds_data.json
+with open("models/ds_data.json", "r") as f:
+    table_data = json.load(f)
+def compute_wer_table(audio, text):
+    # Convert the wav into an array
+    audio_input = audio[1]
+    audio_input = audio_input.astype(np.float32)
+    audio_input = audio_input / 32767
+    trans = []
+    wer_scores = []
+    for model in model_name:
+        pipe = pipeline("automatic-speech-recognition", model=model_name[model])
+        transcription = pipe(audio_input)['text']
+        transcription = transcription.replace(",", "").replace(".", "")
+        trans.append(transcription)
+        wer = wer_metric.compute(predictions=[transcription.upper()], references=[text.upper()])
+        wer_scores.append(wer)
+    result = [[model, t, s] for model, t, s in zip(model_name.keys(), trans, wer_scores)]
+    return result
+with gr.Blocks() as demo:
+    with gr.Tab("Docs"):
+        gr.Markdown((Path(__file__).parent / "demo.md").read_text())
+    with gr.Tab("Demo"):
+        gr.Interface(
+            fn=compute_wer_table,
+            inputs=[
+                gr.Audio(label="Input Audio"),
+                gr.Textbox(label="Reference Text")
+            ],
+            outputs=gr.Dataframe(headers=["Model", "Transcription", "WER"], label="WER Results"),
+            examples=[[f"assets/output_audio_{i}.wav", table_data[i]['reference']] for i in range(100)],
+            title="ASR Model Evaluation",
+            description=(
+                "This application allows you to evaluate the performance of various Automatic Speech Recognition (ASR) models on "
+                "a given audio sample. Simply provide an audio file and the corresponding reference text, and the app will compute "
+                "the Word Error Rate (WER) for each model. The results will be presented in a table that includes the model name, "
+                "the transcribed text, and the calculated WER. "
+                "\n\n### Table of Results\n"
+                "The table below shows the transcriptions generated by different ASR models, along with their corresponding WER scores. "
+                "Lower WER scores indicate better performance."
+                "\n\n| Model                   | WER                     |\n"
+                "|--------------------------|--------------------------|\n"
+                "| [whisper-tiny](https://huggingface.co/openai/whisper-tiny.en)         | 0.06175      |\n"
+                "| [wav2vec2-large-960h](https://huggingface.co/facebook/wav2vec2-large-960h)  | 0.01617     |\n"
+                "| [distill-whisper-small](https://huggingface.co/distil-whisper/distil-small.en)| 0.04350      |\n"
+                "\n\n### Data Source\n"
+                "The data used in this demo is a subset of the [LibriSpeech](https://huggingface.co/datasets/openslr/librispeech_asr) dataset which contains the first 100 audio samples and their corresponding reference texts in the validation set."
+            ),
+        )
+demo.launch(share=True)