Evaluate ASR models

This is a breakdown of the steps to evaluate ASR models on a small subset of the Librispeech dataset based on the script in the evaluate_asr.py file.

0. Import the necessary libraries

from datasets import load_dataset, Dataset
from transformers import pipeline
import evaluate
import torch
import numpy as np
from tqdm import tqdm
import gradio as gr
from collections import defaultdict
import json

1. Pick a speech dataset (English) from the Hugging Face hub and create a small subset of this dataset (100 rows) by streaming the data.

We will use the librispeech_asr dataset from the Hugging Face hub. We will use the clean configuration and its validation split.

# Stream the "clean" configuration's validation split and keep only the
# first 100 examples of the stream.
ds = load_dataset(
    "openslr/librispeech_asr",
    "clean",
    split="validation",
    streaming=True,
).take(100)

3. Pick three transformers-compatible speech recognition models

We will evaluate the following models:

openai/whisper-tiny.en
facebook/wav2vec2-base-960h
distil-whisper/distil-small.en

# Maps the short label used as a key in the results to the Hugging Face Hub
# checkpoint id that is loaded for it. Entry order is the evaluation order.
# NOTE(review): the label "wav2vec2-large-960h" points at the *base*
# checkpoint; the keys are left unchanged here because other code may index
# by them -- confirm and rename if nothing depends on the current spelling.
model_name = dict(
    (
        ("whisper-tiny", "openai/whisper-tiny.en"),
        ("wav2vec2-large-960h", "facebook/wav2vec2-base-960h"),
        ("distill-whisper-small", "distil-whisper/distil-small.en"),
    )
)

4. Evaluate the models on the dataset

def evaluate_model(ds, pipe, wer_metric):
    """Transcribe every sample in ``ds`` and score each transcription.

    Args:
        ds: Iterable of samples. Each sample is read as
            ``sample["audio"]["array"]`` (handed to ``pipe``) and
            ``sample["text"]`` (the reference transcript). The iterable is
            consumed exactly once, so streaming datasets and one-shot
            iterators are supported.
        pipe: Callable that receives the audio array and returns a mapping
            with a ``"text"`` entry holding the transcription.
        wer_metric: Object exposing
            ``compute(predictions=[...], references=[...])``.

    Returns:
        A ``(wer_scores, wer_results)`` tuple. ``wer_scores`` is the list of
        per-sample scores returned by ``wer_metric.compute``; ``wer_results``
        is a list of dicts with the keys ``"index"``, ``"transcription"``,
        ``"reference"`` and ``"wer"``, one per sample, in iteration order.
    """
    # Only these four punctuation marks are stripped from the model output;
    # any other character (apostrophes, hyphens, digits, ...) is kept as is.
    punctuation = str.maketrans("", "", ",.!?")

    wer_scores = []
    wer_results = []
    # No explicit ``total=``: counting the samples up front would iterate the
    # whole dataset a second time (re-streaming it) and would exhaust a
    # one-shot iterator before the loop even starts. tqdm still shows a total
    # on its own when ``ds`` supports ``len()``.
    for idx, sample in enumerate(tqdm(ds, desc="Evaluating")):
        # NOTE(review): only the raw array is passed, the sample's sampling
        # rate is not forwarded -- presumably the audio is already at the
        # rate the model expects; confirm for other datasets.
        transcription = pipe(sample["audio"]["array"])["text"]

        # Compare case-insensitively by upper-casing both sides.
        hypothesis = transcription.translate(punctuation).upper()
        reference = sample["text"].upper()

        wer = wer_metric.compute(predictions=[hypothesis], references=[reference])
        wer_scores.append(wer)
        wer_results.append({
            "index": idx,
            "transcription": hypothesis,
            "reference": reference,
            "wer": wer,
        })
    return wer_scores, wer_results

# Load the word-error-rate metric once; it is shared by all models.
wer_metric = evaluate.load("wer")

# Materialize the subset a single time. Iterating the streaming dataset
# directly would re-read it for every model evaluated below.
samples = list(ds)

# results: label -> mean of the per-sample WER scores.
# model_wer_results: label -> per-sample result dicts from evaluate_model.
results = {}
model_wer_results = {}
# Evaluate every configured model on the same samples.
for model, checkpoint in model_name.items():
    pipe = pipeline("automatic-speech-recognition", model=checkpoint)
    wer_scores, wer_results = evaluate_model(samples, pipe, wer_metric)
    # NOTE: this is the average of per-sample scores, so every sample weighs
    # the same regardless of its length.
    results[model] = np.mean(wer_scores)
    model_wer_results[model] = wer_results

for model, mean_wer in results.items():
    print(f"Model: {model}, WER: {mean_wer}")