import json
import os
import shutil
import sys
from collections import defaultdict
from statistics import mean

import pandas as pd
import requests

from constants import BASE_WHISPERKIT_BENCHMARK_URL
from text_normalizer import text_normalizer
from utils import compute_average_wer, download_dataset


def fetch_evaluation_data(url):
    """
    Fetches evaluation data from the given URL.

    :param url: The URL to fetch the evaluation data from.
    :returns: The evaluation data as a dictionary.
    :raises SystemExit: If the request fails.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.text)
    else:
        sys.exit(f"Failed to fetch WhisperKit evals: {response.text}")


def process_benchmark_file(file_path, dataset_dfs, device_map, results):
    """
    Processes a single benchmark file and updates the results dictionary.

    :param file_path: Path to the benchmark JSON file.
    :param dataset_dfs: Dictionary of DataFrames containing dataset information.
    :param device_map: Dictionary mapping device identifiers to display names.
    :param results: Dictionary to store the processed results.

    This function reads a benchmark JSON file, extracts relevant information,
    and updates the results dictionary with various metrics including WER,
    speed, tokens per second, and quality of inference (QoI).
    """
    with open(file_path, "r") as file:
        test_results = json.load(file)

    if len(test_results) == 0:
        return

    commit_hash_timestamp = file_path.split("/")[-2]
    commit_timestamp, commit_hash = commit_hash_timestamp.split("_")

    first_test_result = test_results[0]
    if first_test_result is None:
        return

    # `str.strip(".json")` strips characters, not the suffix; use splitext instead.
    filename = os.path.splitext(os.path.basename(file_path))[0]
    device, company, model, dataset_dir, timestamp = filename.split("_")
    model = f"{company}_{model}"
    if device not in device_map:
        return
    device = device_map[device]
    os_info = first_test_result["staticAttributes"]["os"]
    key = (model, device, os_info, commit_timestamp)
    dataset_name = dataset_dir

    for test_result in test_results:
        if test_result is None:
            continue

        test_info = test_result["testInfo"]
        audio_file_name = test_info["audioFile"]
        dataset_df = dataset_dfs[dataset_name]

        wer_entry = {
            "prediction": text_normalizer(test_info["prediction"]),
            "reference": text_normalizer(test_info["reference"]),
        }
        results[key]["timestamp"] = timestamp
        results[key]["average_wer"].append(wer_entry)

        input_audio_seconds = test_info["timings"]["inputAudioSeconds"]
        full_pipeline = test_info["timings"]["fullPipeline"] / 1000
        time_elapsed = test_result["latencyStats"]["measurements"]["timeElapsed"]
        total_decoding_loops = test_info["timings"]["totalDecodingLoops"]

        results[key]["dataset_speed"][dataset_name][
            "inputAudioSeconds"
        ] += input_audio_seconds
        results[key]["dataset_speed"][dataset_name]["fullPipeline"] += full_pipeline
        results[key]["speed"]["inputAudioSeconds"] += input_audio_seconds
        results[key]["speed"]["fullPipeline"] += full_pipeline
        results[key]["commit_hash"] = commit_hash
        results[key]["commit_timestamp"] = commit_timestamp
        results[key]["dataset_tokens_per_second"][dataset_name][
            "totalDecodingLoops"
        ] += total_decoding_loops
        results[key]["dataset_tokens_per_second"][dataset_name][
            "timeElapsed"
        ] += time_elapsed
        results[key]["tokens_per_second"]["totalDecodingLoops"] += total_decoding_loops
        results[key]["tokens_per_second"]["timeElapsed"] += time_elapsed

        audio = audio_file_name.split(".")[0]
        audio = audio.split("-")[0]
        dataset_row = dataset_df.loc[dataset_df["file"].str.contains(audio)].iloc[0]
        reference_wer = dataset_row["wer"]
        prediction_wer = test_info["wer"]
        results[key]["qoi"].append(1 if prediction_wer <= reference_wer * 110 else 0)
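

# Illustrative shape of one accumulated `results` entry that
# calculate_and_save_performance_results() below consumes. Field names mirror the
# code above; the key and all values are made-up examples, not real benchmark data.
#
#   results[("openai_whisper-tiny", "iPhone 15 Pro", "iOS 17.4", "1714000000")] == {
#       "timestamp": "2024-05-01T120000",
#       "average_wer": [{"prediction": "...", "reference": "..."}, ...],
#       "qoi": [1, 0, 1],
#       "speed": {"inputAudioSeconds": 600.0, "fullPipeline": 42.0},
#       "tokens_per_second": {"totalDecodingLoops": 1200, "timeElapsed": 40.0},
#       "dataset_speed": {"librispeech-10mins": {"inputAudioSeconds": 600.0, "fullPipeline": 42.0}},
#       "dataset_tokens_per_second": {"librispeech-10mins": {"totalDecodingLoops": 1200, "timeElapsed": 40.0}},
#       "commit_hash": "abc1234",
#       "commit_timestamp": "1714000000",
#   }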


def calculate_and_save_performance_results(
    performance_results, performance_output_path
):
    """
    Calculates final performance metrics and saves them to a JSON file.

    :param performance_results: Dictionary containing raw performance data.
    :param performance_output_path: Path to save the processed performance results.

    This function processes the raw performance data, calculates average metrics,
    and writes the final results to a JSON file, with each entry representing a
    unique combination of model, device, and OS.
    """
    not_supported = []

    with open(performance_output_path, "w") as performance_file:
        for key, data in performance_results.items():
            model, device, os_info, timestamp = key
            speed = round(
                data["speed"]["inputAudioSeconds"] / data["speed"]["fullPipeline"], 2
            )
            # if speed < 1.0:
            #     not_supported.append((model, device, os_info))
            #     continue

            performance_entry = {
                "model": model.replace("_", "/"),
                "device": device,
                "os": os_info.replace("_", " "),
                "timestamp": data["timestamp"],
                "speed": speed,
                "tokens_per_second": round(
                    data["tokens_per_second"]["totalDecodingLoops"]
                    / data["tokens_per_second"]["timeElapsed"],
                    2,
                ),
                "dataset_speed": {
                    dataset: round(
                        speed_info["inputAudioSeconds"] / speed_info["fullPipeline"], 2
                    )
                    for dataset, speed_info in data["dataset_speed"].items()
                },
                "dataset_tokens_per_second": {
                    dataset: round(
                        tps_info["totalDecodingLoops"] / tps_info["timeElapsed"], 2
                    )
                    for dataset, tps_info in data["dataset_tokens_per_second"].items()
                },
                "average_wer": compute_average_wer(data["average_wer"]),
                "qoi": round(mean(data["qoi"]), 2),
                "commit_hash": data["commit_hash"],
                "commit_timestamp": data["commit_timestamp"],
            }

            json.dump(performance_entry, performance_file)
            performance_file.write("\n")

    return not_supported


def generate_support_matrix(
    performance_data_path="dashboard_data/performance_data.json",
    output_file="dashboard_data/support_data.csv",
):
    """
    Generate a support matrix CSV showing model compatibility across devices
    and OS versions.

    ✅: All tests passed
    ⚠️: Some tests failed
    """
    support_matrix = defaultdict(
        lambda: defaultdict(lambda: {"os_versions": set(), "dataset_count": 0})
    )
    models = set()
    devices = set()

    # Process performance data
    with open(performance_data_path, "r") as f:
        for line in f:
            entry = json.loads(line)
            model = entry["model"]
            device = entry["device"]
            os_info = entry["os"]

            models.add(model)
            devices.add(device)

            support_matrix[model][device]["os_versions"].add(os_info)
            if "dataset_speed" in entry:
                support_matrix[model][device]["dataset_count"] = len(
                    entry["dataset_speed"]
                )

    # Create DataFrame with correct headers
    df = pd.DataFrame(
        columns=["", "Model"] + [f'"{device}"' for device in sorted(devices)]
    )

    # Add each model with its data
    for model in sorted(models):
        row_data = {"": model, "Model": model}
        for device in sorted(devices):
            info = support_matrix[model].get(
                device, {"dataset_count": 0, "os_versions": set()}
            )
            os_versions = ", ".join(sorted(info["os_versions"]))

            if info["dataset_count"] == 0:
                row_data[f'"{device}"'] = "Not Supported"
            elif info["dataset_count"] >= 2:
                row_data[f'"{device}"'] = f"✅ {os_versions}"
            else:
                row_data[f'"{device}"'] = f"⚠️ {os_versions}"

        df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)

    # Save to CSV
    df.to_csv(output_file, index=False)
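

# Schematic support_data.csv layout produced by generate_support_matrix() above.
# Exact quoting follows pandas CSV defaults; the models, devices, and OS versions
# shown here are made-up examples:
#
#   ,Model,"iPhone 15 Pro","Mac14,12"
#   openai/whisper-tiny,openai/whisper-tiny,✅ iOS 17.4,✅ macOS 14.4
#   openai/whisper-large-v3,openai/whisper-large-v3,⚠️ iOS 17.4,Not Supported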


def main():
    """
    Main function to orchestrate the performance data generation process.

    This function performs the following steps:
    1. Downloads benchmark data if requested.
    2. Fetches evaluation data for various datasets.
    3. Processes benchmark files and summary files.
    4. Calculates and saves performance and support results.
    """
    source_xcresult_repo = "argmaxinc/whisperkit-evals-dataset"
    source_xcresult_subfolder = "benchmark_data/"
    source_xcresult_directory = f"{source_xcresult_repo}/{source_xcresult_subfolder}"

    if len(sys.argv) > 1 and sys.argv[1] == "download":
        try:
            shutil.rmtree(source_xcresult_repo)
        except FileNotFoundError:
            print("Nothing to remove.")
        download_dataset(
            source_xcresult_repo, source_xcresult_repo, source_xcresult_subfolder
        )

    datasets = {
        "Earnings-22": "https://huggingface.co./datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/earnings22/2024-03-04_13%3A39%3A42_GMT-0800.json",
        "LibriSpeech": "https://huggingface.co./datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech/2024-02-28_18%3A45%3A02_GMT-0800.json?download=true",
        "earnings22-10mins": "https://huggingface.co./datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/earnings22/2024-03-04_13%3A39%3A42_GMT-0800.json",
        "librispeech-10mins": "https://huggingface.co./datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech/2024-02-28_18%3A45%3A02_GMT-0800.json?download=true",
        "earnings22-12hours": "https://huggingface.co./datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/earnings22/2024-03-04_13%3A39%3A42_GMT-0800.json",
        "librispeech": "https://huggingface.co./datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech/2024-02-28_18%3A45%3A02_GMT-0800.json?download=true",
    }

    dataset_dfs = {}
    for dataset_name, url in datasets.items():
        evals = fetch_evaluation_data(url)
        dataset_dfs[dataset_name] = pd.json_normalize(evals["results"])

    performance_results = defaultdict(
        lambda: {
            "average_wer": [],
            "qoi": [],
            "speed": {"inputAudioSeconds": 0, "fullPipeline": 0},
            "tokens_per_second": {"totalDecodingLoops": 0, "timeElapsed": 0},
            "dataset_speed": defaultdict(
                lambda: {"inputAudioSeconds": 0, "fullPipeline": 0}
            ),
            "dataset_tokens_per_second": defaultdict(
                lambda: {"totalDecodingLoops": 0, "timeElapsed": 0}
            ),
            "timestamp": None,
            "commit_hash": None,
            "commit_timestamp": None,
            "test_timestamp": None,
        }
    )

    with open("dashboard_data/device_map.json", "r") as f:
        device_map = json.load(f)

    for subdir, _, files in os.walk(source_xcresult_directory):
        for filename in files:
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(subdir, filename)
            process_benchmark_file(
                file_path, dataset_dfs, device_map, performance_results
            )

    calculate_and_save_performance_results(
        performance_results, "dashboard_data/performance_data.json"
    )
    generate_support_matrix()


if __name__ == "__main__":
    main()
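

# Usage sketch (the script filename below is illustrative; run from the repository
# root so the relative dashboard_data/ paths resolve):
#
#   python generate_performance_data.py download   # re-download benchmark data, then process
#   python generate_performance_data.py            # process already-downloaded benchmark data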