Spaces:

argmaxinc
/

whisperkit-benchmarks

Running

File size: 17,231 Bytes

e48391e

import json
import os
import re
from datetime import datetime
from typing import Tuple

import pandas as pd
from bs4 import BeautifulSoup


def format_datetime(dt_str: str) -> str:
    """
    Render an ISO-style datetime string in a more readable form.

    Everything from the first "+" onward (for example a UTC offset) is
    dropped, and every "T" in the remaining text is replaced by a space.

    :param dt_str: Datetime string such as "2024-01-02T03:04:05+00:00"
    :return: The reformatted string, e.g. "2024-01-02 03:04:05"
    """
    without_offset, _, _ = dt_str.partition("+")
    return without_offset.replace("T", " ")


def read_json_line_by_line(file_path):
    """
    Read a JSON Lines file, parsing each non-blank line as its own JSON value.

    :param file_path: Path to the file containing one JSON value per line
    :return: List of parsed JSON values, in file order

    Blank or whitespace-only lines are ignored silently. A line that is not
    valid JSON is reported on stdout and skipped, so one corrupt record does
    not abort the whole read. The file is decoded as UTF-8 regardless of the
    platform's default encoding.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            payload = line.strip()
            if not payload:
                # An empty line is not a record; do not report it as invalid.
                continue
            try:
                data.append(json.loads(payload))
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON in {file_path}: {line}")
    return data


def calculate_change(new: float, old: float, metric_name: str) -> Tuple[float, str]:
    """
    Return the difference ``new - old`` together with a trend emoji.

    The difference is the plain subtraction of the two values (it is not
    divided by ``old``). Differences smaller than 1 in magnitude count as
    unchanged. Otherwise an increase is marked as good and a decrease as bad,
    except for metrics whose name contains "wer" (case-insensitive), where
    the meaning is reversed.

    :param new: Current metric value
    :param old: Previous metric value
    :param metric_name: Metric name or label, used only for the "wer" check
    :return: Tuple of (difference, emoji)
    """
    delta = new - old
    if abs(delta) < 1:
        return (delta, "↔️")

    lower_is_better = "wer" in metric_name.lower()
    went_up = delta > 0
    # Good news is "up" for ordinary metrics and "not up" for WER-like ones.
    emoji = "🟢" if went_up != lower_is_better else "❌"
    return (delta, emoji)


def has_changes(config, prev_dict, curr_dict):
    """
    Tell whether any tracked metric moved by at least 1 for ``config``.

    Only metrics present in both the previous and the current record are
    compared; the comparison uses the absolute difference of the two values.

    :param config: Key identifying the configuration in both dictionaries
    :param prev_dict: Mapping of configuration key to previous metrics
    :param curr_dict: Mapping of configuration key to current metrics
    :return: True if at least one shared metric differs by 1 or more
    """
    current = curr_dict[config]
    previous = prev_dict[config]
    tracked = ("speed", "tokens_per_second", "average_wer", "qoi")
    return any(
        abs(current[name] - previous[name]) >= 1
        for name in tracked
        if name in current and name in previous
    )


def format_metrics_table(config, prev_dict, curr_dict):
    """
    Build a fenced, fixed-width text table of the metrics that changed.

    A row is emitted for each tracked metric that exists in both records and
    whose difference (as returned by ``calculate_change``) is at least 1 in
    magnitude. The header and the code fences are always present.

    :param config: Key identifying the configuration in both dictionaries
    :param prev_dict: Mapping of configuration key to previous metrics
    :param curr_dict: Mapping of configuration key to current metrics
    :return: The table as a single string wrapped in triple backticks
    """
    current = curr_dict[config]
    previous = prev_dict[config]

    labelled_keys = (
        ("Speed", "speed"),
        ("Tok/s", "tokens_per_second"),
        ("WER", "average_wer"),
        ("QoI", "qoi"),
    )

    pieces = [
        "```\nMetric    Previous    Current    Change\n--------------------------------\n"
    ]
    for label, key in labelled_keys:
        if key not in current or key not in previous:
            continue
        new_value = current[key]
        old_value = previous[key]
        delta, _ = calculate_change(new_value, old_value, label)
        if abs(delta) >= 1:  # rows below the threshold are left out
            pieces.append(
                f"{label:<9} {old_value:<11.2f} {new_value:<10.2f} {delta:.2f}\n"
            )
    pieces.append("```")
    return "".join(pieces)


def extract_status_and_os(cell_value):
    """
    Extract (status, OS version) pairs from one support-table cell.

    The cell is converted to a string and split on "<p>", so the leading
    text and each following paragraph are examined separately. A piece that
    contains the warning symbol is parsed as HTML and the OS version is
    searched in its visible text; any other piece is searched directly and
    reported as a success. Pieces without an "iOS", "iPadOS" or "macOS"
    version produce no entry.

    :param cell_value: Raw cell content (HTML fragment, plain text, or any
        value that can be passed to ``str``)
    :return: List of tuples ``[(status, os_version), ...]`` in cell order;
        empty when the cell is exactly "Not Supported"
    """
    cell_text = str(cell_value)

    # A cell that is exactly this marker has nothing to report.
    if cell_text == "Not Supported":
        return []

    os_pattern = re.compile(r"(iOS|iPadOS|macOS)\s+[\d.]+")
    closing_tag = "</p>"
    results = []

    for part in cell_text.split("<p>"):
        part = part.strip()
        # Remove only a real closing tag. str.strip("</p>") would treat the
        # argument as a set of characters and also eat the "<" / ">" of any
        # markup at either end of the piece (e.g. an anchor tag), handing
        # broken HTML to the parser below.
        if part.endswith(closing_tag):
            part = part[: -len(closing_tag)]
        if not part:
            continue

        if "⚠️" in part:
            # Warning entries carry markup; search only the visible text.
            status = "⚠️"
            text = BeautifulSoup(part, "html.parser").get_text()
        else:
            # Success entries have the OS version directly in the text.
            status = "✅"
            text = part

        os_match = os_pattern.search(text)
        if os_match:
            results.append((status, os_match.group(0)))

    return results


def escape_string(s: str) -> str:
    """
    Backslash-escape the characters that cannot appear raw in a quoted value.

    Backslashes and double quotes are prefixed with a backslash, and newline
    and carriage-return characters become the two-character sequences
    ``\\n`` and ``\\r``, so the result fits on a single line.

    :param s: Text to escape
    :return: The escaped text
    """
    replacements = {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
    }
    # One pass over the input; no replacement output is ever re-escaped.
    return s.translate(str.maketrans(replacements))


def analyze_support_changes(prev_csv, curr_csv):
    """
    Compare two support-matrix CSV files and report what changed.

    Each file is loaded with its first column as the row index (the model).
    Every remaining column except the first one is treated as a device, and
    each cell is expanded into (status, OS version) pairs by
    ``extract_status_and_os``.

    :param prev_csv: Path to the previous support CSV
    :param curr_csv: Path to the current support CSV
    :return: Tuple ``(fixed_errors, new_errors, new_configs, needs_alert)``
        where the first three are lists of ``(model, device, os_version)``
        tuples (previously failing and now passing, previously passing and
        now failing, and present only in the current file), and
        ``needs_alert`` is True when the current file has fewer than 90% of
        the previous file's device columns
    """

    def load(path):
        frame = pd.read_csv(path)
        return frame.set_index(frame.columns[0])

    def iter_statuses(frame):
        # Row-major walk, so later duplicates of a key overwrite earlier ones.
        for row_pos, model in enumerate(frame.index):
            for col_pos, device in enumerate(frame.columns[1:], start=1):
                cell = frame.iloc[row_pos, col_pos]
                for status, os_version in extract_status_and_os(cell):
                    yield (model, device, os_version), status

    prev_df = load(prev_csv)
    curr_df = load(curr_csv)

    # NOTE(review): after set_index the index column is no longer part of
    # .columns, so [1:] skips one further leading column — presumably a
    # non-device column in the CSV layout; confirm against the data files.
    prev_device_count = len(prev_df.columns[1:])
    curr_device_count = len(curr_df.columns[1:])

    # Alert if fewer than 90% of the previous devices are present.
    device_ratio = curr_device_count / prev_device_count if prev_device_count else 1
    needs_alert = device_ratio < 0.9

    prev_status = dict(iter_statuses(prev_df))

    curr_status = {}
    new_configs = []
    for key, status in iter_statuses(curr_df):
        curr_status[key] = status
        if key not in prev_status:
            new_configs.append(key)

    fixed_errors = []
    new_errors = []
    # Only configurations present in both files can flip status.
    for key in set(prev_status.keys()) & set(curr_status.keys()):
        transition = (prev_status[key], curr_status[key])
        if transition == ("⚠️", "✅"):
            fixed_errors.append(key)
        elif transition == ("✅", "⚠️"):
            new_errors.append(key)

    return fixed_errors, new_errors, new_configs, needs_alert


def _mrkdwn_section(text):
    """Return a Slack section block whose text is a single mrkdwn object."""
    return {"type": "section", "text": {"type": "mrkdwn", "text": text}}


def _count_metric_changes(common_configs, prev_dict, curr_dict, metric_keys):
    """Tally, per metric key, how many shared configurations improved or regressed."""
    improved = dict.fromkeys(metric_keys, 0)
    regressed = dict.fromkeys(metric_keys, 0)
    for config in common_configs:
        prev = prev_dict[config]
        curr = curr_dict[config]
        for key in metric_keys:
            if key not in curr or key not in prev:
                continue
            delta = curr[key] - prev[key]
            if abs(delta) < 1:
                # Same threshold has_changes uses: smaller moves are noise.
                continue
            # Same direction rule as calculate_change: an increase is good
            # unless the metric is a WER, where an increase is bad.
            went_up = delta > 0
            lower_is_better = "wer" in key.lower()
            if went_up != lower_is_better:
                improved[key] += 1
            else:
                regressed[key] += 1
    return improved, regressed


def generate_report():
    """
    Compare the previous and current benchmark snapshots and publish a summary.

    Inputs are read from fixed relative paths: ``report_data/`` holds the
    previous snapshot and ``dashboard_data/`` the current one. Each directory
    provides ``performance_data.json`` (one JSON object per line),
    ``version.json`` and ``support_data.csv``.

    When the ``GITHUB_OUTPUT`` environment variable is set, two outputs are
    appended to the file it names:

    * ``slack_message_payload`` - a JSON document with a ``blocks`` list
      (version info, release changes, per-metric improved/regressed counts,
      device support changes and an optional device-count alert), written
      with the ``<<EOF`` multi-line syntax;
    * ``performance_message`` - a single-line, backslash-escaped text listing
      every configuration present in both snapshots, with a metrics table
      for those that changed. It is empty when the snapshots share no
      configuration.

    Nothing is written when ``GITHUB_OUTPUT`` is unset.
    """
    metric_labels = (
        ("Speed", "speed"),
        ("Tok/s", "tokens_per_second"),
        ("WER", "average_wer"),
        ("QoI", "qoi"),
    )

    # Load current and previous performance data, keyed by configuration.
    prev_perf_data = read_json_line_by_line("report_data/performance_data.json")
    curr_perf_data = read_json_line_by_line("dashboard_data/performance_data.json")

    prev_dict = {(d["model"], d["device"], d["os"]): d for d in prev_perf_data}
    curr_dict = {(d["model"], d["device"], d["os"]): d for d in curr_perf_data}
    common_configs = set(curr_dict) & set(prev_dict)

    # Load version data.
    with open("report_data/version.json", "r", encoding="utf-8") as f:
        prev_version = json.load(f)
    with open("dashboard_data/version.json", "r", encoding="utf-8") as f:
        curr_version = json.load(f)

    # A missing "releases" key is treated as "no releases" everywhere below.
    curr_release_list = curr_version.get("releases", [])
    prev_releases = set(prev_version.get("releases", []))
    curr_releases = set(curr_release_list)
    new_releases = curr_releases - prev_releases
    removed_releases = prev_releases - curr_releases

    # Count improvements and regressions across the shared configurations so
    # the summary reflects the data instead of always reporting zero.
    improved_metrics, regressed_metrics = _count_metric_changes(
        common_configs, prev_dict, curr_dict, [key for _, key in metric_labels]
    )
    new_data_points = len(set(curr_dict) - set(prev_dict))

    # Analyze support changes.
    fixed_errors, new_errors, new_configs, needs_alert = analyze_support_changes(
        "report_data/support_data.csv", "dashboard_data/support_data.csv"
    )

    # Day of month without zero padding; built from now.day because the
    # "%-d" strftime directive is not available on every platform.
    now = datetime.now()
    current_time = f"{now:%B} {now.day}, {now:%Y %H:%M:%S}"
    prev_release_tag, curr_release_tag = (
        prev_version["whisperkit_version"],
        curr_version["whisperkit_version"],
    )

    current_releases_text = ", ".join(f"`{r}`" for r in curr_release_list)
    added_text = ", ".join(sorted(new_releases)) if new_releases else "None"
    removed_text = ", ".join(sorted(removed_releases)) if removed_releases else "None"

    # NOTE(review): every list entry below becomes its own block; Slack caps
    # the number of blocks per message, so very long lists may be rejected —
    # confirm against the Block Kit limits.
    blocks = [
        {
            "type": "header",
            "text": {
                "type": "plain_text",
                "text": "🔔 WhisperKit Dataset Update Report 🔔",
                "emoji": True,
            },
        },
        {
            "type": "context",
            "elements": [{"text": f"*{current_time}*", "type": "mrkdwn"}],
        },
        {"type": "divider"},
        _mrkdwn_section("ℹ️ *CURRENT VERSION INFO* ℹ️"),
        _mrkdwn_section(
            f"• *Last Modified:* `{format_datetime(curr_version['last_modified'])}`"
        ),
        _mrkdwn_section(f"• *Dataset SHA:* `{curr_version['sha']}`"),
        _mrkdwn_section(f"• *Current Releases:* {current_releases_text}"),
        _mrkdwn_section(f"• *Current Release Tag:* `{curr_release_tag}`"),
        {"type": "divider"},
        _mrkdwn_section("🔄 *SUMMARY OF PERFORMANCE UPDATES* 🔄"),
        # Release information
        _mrkdwn_section(f"• *Added Releases:* {added_text}"),
        _mrkdwn_section(f"• *Removed Releases:* {removed_text}"),
    ]

    if prev_release_tag != curr_release_tag:
        blocks.append(
            _mrkdwn_section(
                f"• *Release Tag Change:* `{prev_release_tag}` → `{curr_release_tag}`"
            )
        )

    # The "\n" sections act as vertical spacers around the data-point line.
    blocks.append(_mrkdwn_section("\n"))
    blocks.append(
        _mrkdwn_section(f"• *New Data Points:* `{new_data_points}` new configurations")
    )
    blocks.append(_mrkdwn_section("\n"))

    # Metrics summary
    for metric_name, key in metric_labels:
        blocks.append(
            _mrkdwn_section(
                f"• *{metric_name}:* `{improved_metrics[key]}` improved, `{regressed_metrics[key]}` regressed"
            )
        )

    # Support changes: one titled group per non-empty category.
    support_groups = (
        ("*Successful Configurations That Override Previous Failures*", fixed_errors),
        ("*Failed Configurations That Override Previous Successes*", new_errors),
        ("*Newly Tested Configurations*", new_configs),
    )
    if fixed_errors or new_errors or new_configs:
        blocks.append({"type": "divider"})
        blocks.append(_mrkdwn_section("📱 *DEVICE SUPPORT CHANGES* 📱"))
        for title, entries in support_groups:
            if not entries:
                continue
            blocks.append(_mrkdwn_section(title))
            blocks.extend(
                _mrkdwn_section(f"• {model} on {device} ({os_version})")
                for model, device, os_version in sorted(entries)
            )

    # Alert on a significant decrease in device count.
    if needs_alert:
        blocks.append(
            _mrkdwn_section(
                "⚠️ *ALERT:* Current device count is less than 90% of previous version's device count, test on more devices before updating the benchmark website!"
            )
        )

    slack_blocks = {"blocks": blocks}

    # Performance text as a single mrkdwn string. It starts out empty so it
    # is always defined, even when the snapshots share no configuration.
    performance_parts = []
    if common_configs:
        performance_parts.append("💡 *Performance Updates* 💡\n\n")

        # Sorting the (model, device, os) tuples groups the entries by model.
        for config in sorted(common_configs):
            # NOTE(review): the label shows model and OS only; the device
            # (config[1]) is not included — confirm this is intended.
            device_info = f"*{config[0]}* ({config[2]})"

            if not has_changes(config, prev_dict, curr_dict):
                # No changes: just list the configuration with a checkmark.
                performance_parts.append(f"{device_info} ✅\n\n")
            else:
                # Changes: show the metrics table.
                performance_parts.append(f"{device_info}\n")
                performance_parts.append(
                    format_metrics_table(config, prev_dict, curr_dict)
                )
                performance_parts.append("\n\n")
    performance_text = "".join(performance_parts)

    # Write to GITHUB_OUTPUT
    github_output = os.getenv("GITHUB_OUTPUT")
    if github_output:
        with open(github_output, "a", encoding="utf-8") as f:
            # Multi-line value: delimited by the EOF marker lines.
            f.write("slack_message_payload<<EOF\n")
            json.dump(slack_blocks, f, indent=2)
            f.write("\nEOF\n")

            # Single-line value: newlines must be escaped.
            escaped_text = escape_string(performance_text)
            print(f"performance_message={escaped_text}", file=f)


# Generate the report only when this file is run as a script, not on import.
if __name__ == "__main__":
    generate_report()