File size: 3,535 Bytes
ed6583e
 
 
 
 
 
 
347882e
ed6583e
 
 
 
 
 
 
 
 
 
 
 
6ee85be
ed6583e
347882e
 
 
 
ed6583e
 
 
 
 
 
 
 
bd81ec2
 
ed6583e
 
 
b293ec6
347882e
ed6583e
 
 
 
 
 
 
bd81ec2
ed6583e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66cbb93
 
 
ed6583e
 
 
 
 
0f94bf5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from datasets import load_dataset, Dataset
from transformers import pipeline
import evaluate
import numpy as np
import gradio as gr
import json
from pathlib import Path
import re


# Word Error Rate metric from the `evaluate` library, loaded once at import
# time and reused for every scoring call.
wer_metric = evaluate.load("wer")

# Display label (shown in the results table) -> Hugging Face checkpoint id.
model_name = {
    "whisper-tiny": "openai/whisper-tiny.en",
    # NOTE(review): the label says "large" but the checkpoint is the *base*
    # wav2vec2 model -- confirm which one is intended before changing either.
    "wav2vec2-large-960h": "facebook/wav2vec2-base-960h",
    "distill-whisper-small": "distil-whisper/distil-small.en",
}

# Example metadata: indexed by position, each entry's 'reference' field
# supplies the reference text for the matching example audio file.
# The encoding is explicit because JSON is UTF-8 and the platform default
# (locale-dependent) could otherwise mis-decode non-ASCII characters.
with open("ds_data.json", "r", encoding="utf-8") as f:
    table_data = json.load(f)

def clean_text(text):
    """Return *text* with every '.', ',', '!' and '?' character removed.

    All other characters (including other punctuation and whitespace) are
    left untouched.
    """
    punctuation = re.compile(r'[.,!?]')
    stripped = punctuation.sub('', text)
    return stripped

# Loaded ASR pipelines keyed by checkpoint id, so each model is loaded once
# per process instead of on every request.
_ASR_PIPELINES = {}


def _get_asr_pipeline(checkpoint):
    """Return the ASR pipeline for *checkpoint*, loading it on first use."""
    if checkpoint not in _ASR_PIPELINES:
        _ASR_PIPELINES[checkpoint] = pipeline(
            "automatic-speech-recognition", model=checkpoint
        )
    return _ASR_PIPELINES[checkpoint]


def compute_wer_table(audio, text):
    """Transcribe *audio* with every configured model and score it against *text*.

    Args:
        audio: ``(sample_rate, samples)`` pair, the format ``gr.Audio`` emits
            in its numpy mode. ``samples`` may be mono ``(n,)`` or
            multi-channel ``(n, channels)``; channels are averaged to mono.
        text: Reference transcript. The same punctuation stripping and
            upper-casing applied to the model output is applied to it, so
            punctuation in the reference does not count as word errors.

    Returns:
        A list of ``[model label, transcription, WER]`` rows, one per entry
        of ``model_name``, in that mapping's order.

    Raises:
        ValueError: If no audio was supplied, or the reference text is empty
            once punctuation has been removed (WER is undefined then).
    """
    if audio is None:
        raise ValueError("Please provide an audio sample.")

    reference = clean_text(text or "").upper()
    if not reference.strip():
        raise ValueError("Please provide a non-empty reference text.")

    sampling_rate = audio[0]
    samples = np.asarray(audio[1])

    # Scale integer PCM to roughly [-1, 1] using the dtype's own full-scale
    # value (32767 for int16). Floating-point input is taken as already
    # normalised -- NOTE(review): confirm no caller passes unscaled floats.
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = np.iinfo(samples.dtype).max
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    # The ASR pipelines expect single-channel audio.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    rows = []
    for label, checkpoint in model_name.items():
        asr = _get_asr_pipeline(checkpoint)
        # Pass the sample rate along with the waveform so the pipeline can
        # resample when it differs from the model's rate, instead of silently
        # assuming they match. A fresh dict is built per call because the
        # pipeline may mutate the dict it receives.
        output = asr({"sampling_rate": sampling_rate, "raw": samples})
        transcription = clean_text(output['text'])
        score = wer_metric.compute(
            predictions=[transcription.upper()],
            references=[reference],
        )
        rows.append([label, transcription, score])

    return rows

# Two-tab UI: static documentation rendered from demo.md, and the interactive
# WER comparison backed by compute_wer_table.
with gr.Blocks() as demo:
    with gr.Tab("Docs"):
        # demo.md is resolved next to this file; the encoding is explicit so
        # the Markdown decodes the same way regardless of the host locale.
        gr.Markdown((Path(__file__).parent / "demo.md").read_text(encoding="utf-8"))
    with gr.Tab("Demo"):
        gr.Interface(
            fn=compute_wer_table,
            inputs=[
                gr.Audio(label="Input Audio"),
                gr.Textbox(label="Reference Text")
            ],
            outputs=gr.Dataframe(headers=["Model", "Transcription", "WER"], label="WER Results"),
            # At most the first 100 entries; slicing (rather than a fixed
            # range(100)) avoids an IndexError when fewer are available.
            examples=[
                [f"assets/output_audio_{i}.wav", entry['reference']]
                for i, entry in enumerate(table_data[:100])
            ],
            title="ASR Model Evaluation",
            description=(
                "This application allows you to evaluate the performance of various Automatic Speech Recognition (ASR) models on "
                "a given audio sample. Simply provide an audio file and the corresponding reference text, and the app will compute "
                "the Word Error Rate (WER) for each model. The results will be presented in a table that includes the model name, "
                "the transcribed text, and the calculated WER. "
                "\n\n### Table of Results\n"
                "The table below shows the transcriptions generated by different ASR models, along with their corresponding WER scores. "
                "Lower WER scores indicate better performance."
                "\n\n| Model                   | WER                     |\n"
                "|--------------------------|--------------------------|\n"
                "| [whisper-tiny](https://huggingface.co/openai/whisper-tiny.en)         | 0.05511      |\n"
                "| [wav2vec2-large-960h](https://huggingface.co/facebook/wav2vec2-large-960h)  | 0.01617     |\n"
                "| [distill-whisper-small](https://huggingface.co/distil-whisper/distil-small.en)| 0.03686      |\n"
                "\n\n### Data Source\n"
                "The data used in this demo is a subset of the [LibriSpeech](https://huggingface.co/datasets/openslr/librispeech_asr) dataset which contains the first 100 audio samples and their corresponding reference texts in the validation set."
            ),
        )

demo.launch()