import json
import os
import re
from datetime import datetime
from typing import Tuple
import pandas as pd
from bs4 import BeautifulSoup
def format_datetime(dt_str: str) -> str:
"""
Format a datetime string for display.
:param dt_str: String representing a datetime in ISO format
:return: Formatted datetime string
"""
return dt_str.replace("T", " ").split("+")[0]
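# Illustrative example (hypothetical timestamp): an ISO-8601 string with a
# timezone offset is flattened to a plain display form, e.g.
#   format_datetime("2024-12-02T14:03:27+00:00")  ->  "2024-12-02 14:03:27"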
def read_json_line_by_line(file_path):
"""
Read a JSON file line by line, parsing each line as a separate JSON object.
:param file_path: Path to the JSON file
:return: List of parsed JSON objects
This function is useful for reading large JSON files that contain one JSON object
per line. It handles JSON parsing errors gracefully, skipping invalid lines.
"""
data = []
with open(file_path, "r") as f:
for line in f:
try:
item = json.loads(line.strip())
data.append(item)
except json.JSONDecodeError:
print(f"Skipping invalid JSON in {file_path}: {line}")
return data
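# Illustrative example (hypothetical file contents): for a JSON Lines file such as
#   {"model": "tiny", "speed": 42.0}
#   {"model": "base", "speed": 35.5}
# read_json_line_by_line(path) returns a list of two dicts; malformed lines
# are printed and skipped instead of raising.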
def calculate_change(new: float, old: float, metric_name: str) -> Tuple[float, str]:
"""Calculate percentage change and return with appropriate emoji."""
pct_change = new - old
if abs(pct_change) < 1:
emoji = "↔️"
elif pct_change > 0:
emoji = "🟒" if "wer" not in metric_name.lower() else "❌"
else:
emoji = "❌" if "wer" not in metric_name.lower() else "🟒"
return (pct_change, emoji)
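# Illustrative examples (hypothetical values). The returned change is the raw
# difference new - old, and the emoji direction flips for WER, where lower is
# better:
#   calculate_change(110.0, 100.0, "speed")       ->  (10.0, "🟢")
#   calculate_change(4.0, 6.0, "average_wer")     ->  (-2.0, "🟢")
#   calculate_change(100.5, 100.0, "qoi")         ->  (0.5, "↔️")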
def has_changes(config, prev_dict, curr_dict):
"""Check if any metrics have changed."""
curr = curr_dict[config]
prev = prev_dict[config]
metrics = ["speed", "tokens_per_second", "average_wer", "qoi"]
for key in metrics:
if key in curr and key in prev:
curr_val = curr[key]
prev_val = prev[key]
if abs(curr_val - prev_val) >= 1: # 1% threshold
return True
return False
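# Illustrative example (hypothetical config and values): with the
# (model, device, os)-keyed dicts built in generate_report(), e.g.
#   cfg = ("tiny", "iPhone15,2", "iOS 17.4")
#   prev = {cfg: {"speed": 30.0, "qoi": 80.0}}
#   curr = {cfg: {"speed": 30.4, "qoi": 85.0}}
# has_changes(cfg, prev, curr) is True: qoi moved by 5 points while speed
# stayed within the 1-point threshold.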
def format_metrics_table(config, prev_dict, curr_dict):
"""Format metrics into a table string."""
curr = curr_dict[config]
prev = prev_dict[config]
metrics = [
("Speed", "speed"),
("Tok/s", "tokens_per_second"),
("WER", "average_wer"),
("QoI", "qoi"),
]
table = "```\nMetric Previous Current Change\n--------------------------------\n"
for metric_name, key in metrics:
if key in curr and key in prev:
curr_val = curr[key]
prev_val = prev[key]
pct_change, _ = calculate_change(curr_val, prev_val, metric_name)
if abs(pct_change) >= 1: # Only show metrics with changes
table += f"{metric_name:<9} {prev_val:<11.2f} {curr_val:<10.2f} {pct_change:.2f}\n"
table += "```"
return table
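# Illustrative row (hypothetical values): a config whose speed moved from
# 100.0 to 110.0 while every other metric stayed within 1 point contributes
# one fixed-width line to the fenced block, roughly
#   Speed     100.00      110.00     10.00
# and metrics with sub-1-point changes are omitted entirely.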
def extract_status_and_os(cell_value):
"""
Extract status and OS versions from a cell, handling both HTML and plain text.
Returns list of tuples: [(status, os_version), ...]
"""
results = []
cell_value = str(cell_value)
# First, handle the case where there are no HTML tags
if cell_value == "Not Supported":
return results
# Split the cell into parts (first element and subsequent <p> elements)
parts = cell_value.split("<p>")
for part in parts:
part = part.strip("</p>")
if not part:
continue
# Check if part contains warning symbol
if "⚠️" in part:
# Parse HTML to extract OS version from anchor tag
soup = BeautifulSoup(part, "html.parser")
# Find text after href that contains OS version
text = soup.get_text()
os_match = re.search(r"(iOS|iPadOS|macOS)\s+[\d.]+", text)
if os_match:
os_version = os_match.group(0)
results.append(("⚠️", os_version))
else:
# For success cases, OS version is directly in the text
os_match = re.search(r"(iOS|iPadOS|macOS)\s+[\d.]+", part)
if os_match:
os_version = os_match.group(0)
results.append(("βœ…", os_version))
return results
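# Illustrative example (hypothetical cell markup mirroring the support CSV
# format this parser expects): a cell such as
#   "✅ iOS 17.4<p>⚠️ <a href='#'>macOS 14.3</a></p>"
# yields [("✅", "iOS 17.4"), ("⚠️", "macOS 14.3")], while a plain
# "Not Supported" cell yields [].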
def escape_string(s: str) -> str:
"""Escape a string to be used as a value in JSON."""
return (
s.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("\n", "\\n")
.replace("\r", "\\r")
)
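# Illustrative example (hypothetical input): escape_string('line one\n"quoted"')
# returns 'line one\\n\\"quoted\\"', so the text can be embedded directly in a
# JSON string value.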
def analyze_support_changes(prev_csv, curr_csv):
"""Analyze support changes between CSV files."""
# Read CSV files
prev_df = pd.read_csv(prev_csv)
prev_df.set_index(prev_df.columns[0], inplace=True)
curr_df = pd.read_csv(curr_csv)
curr_df.set_index(curr_df.columns[0], inplace=True)
# Get device lists (excluding first column which is the index)
prev_devices = sorted(prev_df.columns[1:])
curr_devices = sorted(curr_df.columns[1:])
# Calculate device ratio
device_ratio = len(curr_devices) / len(prev_devices) if prev_devices else 1
needs_alert = device_ratio < 0.9 # Alert if less than 90% of previous devices
# Convert to dictionary for easier comparison
prev_status = {}
curr_status = {}
# Process previous data
for idx in range(len(prev_df)):
model = prev_df.index[idx]
for col_idx in range(1, len(prev_df.columns)):
cell_value = prev_df.iloc[idx, col_idx]
device = prev_df.columns[col_idx]
statuses = extract_status_and_os(cell_value)
for status, os_version in statuses:
prev_status[(model, device, os_version)] = status
# Process current data and track new configurations
new_configs = []
for idx in range(len(curr_df)):
model = curr_df.index[idx]
for col_idx in range(1, len(curr_df.columns)):
cell_value = curr_df.iloc[idx, col_idx]
device = curr_df.columns[col_idx]
statuses = extract_status_and_os(cell_value)
for status, os_version in statuses:
curr_status[(model, device, os_version)] = status
# Check if this is a new configuration
if (model, device, os_version) not in prev_status:
new_configs.append((model, device, os_version))
# Find changes
fixed_errors = []
new_errors = []
# Check all configurations that exist in both datasets
common_configs = set(prev_status.keys()) & set(curr_status.keys())
for config in common_configs:
model, device, os_version = config
if prev_status[config] == "⚠️" and curr_status[config] == "✅":
fixed_errors.append((model, device, os_version))
elif prev_status[config] == "✅" and curr_status[config] == "⚠️":
new_errors.append((model, device, os_version))
return fixed_errors, new_errors, new_configs, needs_alert
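# Illustrative return value (hypothetical data): comparing two support CSVs
# might yield
#   fixed_errors = [("tiny", "iPhone15,2", "iOS 17.4")]
#   new_errors   = []
#   new_configs  = [("base", "iPad14,1", "iPadOS 17.4")]
#   needs_alert  = False
# where each tuple is (model, device, os_version) and needs_alert flags a
# current device count below 90% of the previous run's.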
def generate_report():
# Load current and previous data
prev_perf_data = read_json_line_by_line("report_data/performance_data.json")
curr_perf_data = read_json_line_by_line("dashboard_data/performance_data.json")
prev_dict = {(d["model"], d["device"], d["os"]): d for d in prev_perf_data}
curr_dict = {(d["model"], d["device"], d["os"]): d for d in curr_perf_data}
common_configs = set(curr_dict.keys()) & set(prev_dict.keys())
# Load version data
with open("report_data/version.json", "r") as f:
prev_version = json.load(f)
with open("dashboard_data/version.json", "r") as f:
curr_version = json.load(f)
prev_releases = set(prev_version.get("releases", []))
curr_releases = set(curr_version.get("releases", []))
new_releases = curr_releases - prev_releases
removed_releases = prev_releases - curr_releases
# Track metrics
total_configs = len(common_configs)
improved_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0}
regressed_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0}
new_data_points = len(set(curr_dict.keys()) - set(prev_dict.keys()))
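# NOTE: improved_metrics / regressed_metrics are reported in the summary below
# but are never incremented in this file, so those counts always read 0. A
# minimal sketch of the counting this likely intends (an assumption, reusing
# the 1-point threshold from has_changes) would be:
#   for cfg in common_configs:
#       for key in improved_metrics:
#           if key in curr_dict[cfg] and key in prev_dict[cfg]:
#               diff, _ = calculate_change(curr_dict[cfg][key], prev_dict[cfg][key], key)
#               if abs(diff) < 1:
#                   continue
#               improved = diff < 0 if key == "average_wer" else diff > 0
#               (improved_metrics if improved else regressed_metrics)[key] += 1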
# Analyze support changes
fixed_errors, new_errors, new_configs, needs_alert = analyze_support_changes(
"report_data/support_data.csv", "dashboard_data/support_data.csv"
)
# Create Slack blocks
current_time = datetime.now().strftime("%B %-d, %Y %H:%M:%S")
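# Note: the "%-d" directive above (day of month without zero padding) is a
# glibc/BSD strftime extension and is not available on Windows.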
prev_release_tag, curr_release_tag = (
prev_version["whisperkit_version"],
curr_version["whisperkit_version"],
)
slack_blocks = {
"blocks": [
{
"type": "header",
"text": {
"type": "plain_text",
"text": "πŸ”” WhisperKit Dataset Update Report πŸ””",
"emoji": True,
},
},
{
"type": "context",
"elements": [{"text": f"*{current_time}*", "type": "mrkdwn"}],
},
{"type": "divider"},
{
"type": "section",
"text": {"type": "mrkdwn", "text": "ℹ️ *CURRENT VERSION INFO* ℹ️"},
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ *Last Modified:* `{format_datetime(curr_version['last_modified'])}`",
},
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ *Dataset SHA:* `{curr_version['sha']}`",
},
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ *Current Releases:* {', '.join(f'`{r}`' for r in curr_version['releases'])}",
},
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ *Current Release Tag:* `{curr_release_tag}`",
},
},
{"type": "divider"},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "πŸ”„ *SUMMARY OF PERFORMANCE UPDATES* πŸ”„",
},
},
]
}
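# The payload above follows Slack's Block Kit layout: a top-level "blocks"
# array of header, context, divider, and section objects, suitable for posting
# via an incoming webhook or chat.postMessage.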
# Add release information
slack_blocks["blocks"].extend(
[
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ *Added Releases:* {', '.join(sorted(new_releases)) if new_releases else 'None'}",
},
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ *Removed Releases:* {', '.join(sorted(removed_releases)) if removed_releases else 'None'}",
},
},
]
)
if prev_release_tag != curr_release_tag:
slack_blocks["blocks"].append(
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ *Release Tag Change:* `{prev_release_tag}` β†’ `{curr_release_tag}`",
},
}
)
slack_blocks["blocks"].extend(
[
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "\n",
},
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ *New Data Points:* `{new_data_points}` new configurations",
},
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "\n",
},
},
]
)
# Add metrics summary
for metric_name, key in [
("Speed", "speed"),
("Tok/s", "tokens_per_second"),
("WER", "average_wer"),
("QoI", "qoi"),
]:
slack_blocks["blocks"].append(
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ *{metric_name}:* `{improved_metrics[key]}` improved, `{regressed_metrics[key]}` regressed",
},
}
)
# Add support changes section
if fixed_errors or new_errors or new_configs:
slack_blocks["blocks"].extend(
[
{"type": "divider"},
{
"type": "section",
"text": {"type": "mrkdwn", "text": "πŸ“± *DEVICE SUPPORT CHANGES* πŸ“±"},
},
]
)
if fixed_errors:
slack_blocks["blocks"].extend(
[
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "*Successful Configurations That Override Previous Failures*",
},
}
]
)
for model, device, os_version in sorted(fixed_errors):
slack_blocks["blocks"].append(
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ {model} on {device} ({os_version})",
},
}
)
if new_errors:
slack_blocks["blocks"].extend(
[
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "*Failed Configurations That Override Previous Successes*",
},
}
]
)
for model, device, os_version in sorted(new_errors):
slack_blocks["blocks"].append(
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ {model} on {device} ({os_version})",
},
}
)
if new_configs:
slack_blocks["blocks"].extend(
[
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "*Newly Tested Configurations*",
},
}
]
)
for model, device, os_version in sorted(new_configs):
slack_blocks["blocks"].append(
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"β€’ {model} on {device} ({os_version})",
},
}
)
# Add alert if significant decrease in device count
if needs_alert:
slack_blocks["blocks"].append(
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "⚠️ *ALERT:* Current device count is less than 90% of previous version's device count, test on more devices before updating the benchmark website!",
},
}
)
# Create performance text as a single mrkdwn string
performance_text = ""
if common_configs:
performance_text = "💡 *Performance Updates* 💡\n\n"
# Group by model for better organization
models = sorted(set(model for model, _, _ in common_configs))
for model in models:
model_configs = sorted([cfg for cfg in common_configs if cfg[0] == model])
for config in model_configs:
device_info = f"*{model}* ({config[2]})"
if not has_changes(config, prev_dict, curr_dict):
# If no changes, just add the model with a checkmark
performance_text += f"{device_info} βœ…\n\n"
else:
# If there are changes, show the metrics
performance_text += f"{device_info}\n"
performance_text += format_metrics_table(
config, prev_dict, curr_dict
)
performance_text += "\n\n"
# Write to GITHUB_OUTPUT
github_output = os.getenv("GITHUB_OUTPUT")
if github_output:
with open(github_output, "a") as f:
f.write("slack_message_payload<<EOF\n")
json.dump(slack_blocks, f, indent=2)
f.write("\nEOF\n")
with open(github_output, "a") as f:
escaped_text = escape_string(performance_text)
print(f"performance_message={escaped_text}", file=f)
if __name__ == "__main__":
generate_report()