import base64
import contextlib
import io
import os
import sys
import tempfile
import zipfile
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from PIL import Image

# Seconds to wait for a Hugging Face page before giving up, so a stalled
# request cannot hang a Gradio callback indefinitely.
REQUEST_TIMEOUT = 15

# --------------------------------------------------------------------
# PART 1: TINY DATA + PLOTS
# --------------------------------------------------------------------
# This dataframe is your "tiny" version of model performance data.
# Used for plotting & demonstration in the Gradio app.
# Row layout: [model name, model link, then one score per entry of columns[2:]].
# NOTE(review): two rows use a `hf.xwall.us.kg.m` host while every other row
# uses `huggingface.co.` — the literals are kept as found; confirm the
# intended URLs.
data_full = [
    ['CultriX/Qwen2.5-14B-SLERPv7', 'https://huggingface.co./CultriX/Qwen2.5-14B-SLERPv7', 0.7205, 0.8272, 0.7541, 0.6581, 0.5, 0.729],
    ['djuna/Q2.5-Veltha-14B-0.5', 'https://huggingface.co./djuna/Q2.5-Veltha-14B-0.5', 0.7492, 0.8386, 0.7305, 0.598, 0.43, 0.7817],
    ['CultriX/Qwen2.5-14B-FinalMerge', 'https://huggingface.co./CultriX/Qwen2.5-14B-FinalMerge', 0.7248, 0.8277, 0.7113, 0.7052, 0.57, 0.7001],
    ['CultriX/Qwen2.5-14B-MultiCultyv2', 'https://huggingface.co./CultriX/Qwen2.5-14B-MultiCultyv2', 0.7295, 0.8359, 0.7363, 0.5767, 0.44, 0.7316],
    ['CultriX/Qwen2.5-14B-Brocav7', 'https://huggingface.co./CultriX/Qwen2.5-14B-Brocav7', 0.7445, 0.8353, 0.7508, 0.6292, 0.46, 0.7629],
    ['CultriX/Qwen2.5-14B-Broca', 'https://huggingface.co./CultriX/Qwen2.5-14B-Broca', 0.7456, 0.8352, 0.748, 0.6034, 0.44, 0.7716],
    ['CultriX/Qwen2.5-14B-Brocav3', 'https://huggingface.co./CultriX/Qwen2.5-14B-Brocav3', 0.7395, 0.8388, 0.7393, 0.6405, 0.47, 0.7659],
    ['CultriX/Qwen2.5-14B-Brocav4', 'https://huggingface.co./CultriX/Qwen2.5-14B-Brocav4', 0.7432, 0.8377, 0.7444, 0.6277, 0.48, 0.758],
    ['CultriX/Qwen2.5-14B-Brocav2', 'https://huggingface.co./CultriX/Qwen2.5-14B-Brocav2', 0.7492, 0.8302, 0.7508, 0.6377, 0.51, 0.7478],
    ['CultriX/Qwen2.5-14B-Brocav5', 'https://huggingface.co./CultriX/Qwen2.5-14B-Brocav5', 0.7445, 0.8313, 0.7547, 0.6376, 0.5, 0.7304],
    ['CultriX/Qwen2.5-14B-Brocav6', 'https://huggingface.co./CultriX/Qwen2.5-14B-Brocav6', 0.7179, 0.8354, 0.7531, 0.6378, 0.49, 0.7524],
    ['CultriX/Qwenfinity-2.5-14B', 'https://huggingface.co./CultriX/Qwenfinity-2.5-14B', 0.7347, 0.8254, 0.7279, 0.7267, 0.56, 0.697],
    ['CultriX/Qwen2.5-14B-Emergedv2', 'https://huggingface.co./CultriX/Qwen2.5-14B-Emergedv2', 0.7137, 0.8335, 0.7363, 0.5836, 0.44, 0.7344],
    ['CultriX/Qwen2.5-14B-Unity', 'https://huggingface.co./CultriX/Qwen2.5-14B-Unity', 0.7063, 0.8343, 0.7423, 0.682, 0.57, 0.7498],
    ['CultriX/Qwen2.5-14B-MultiCultyv3', 'https://huggingface.co./CultriX/Qwen2.5-14B-MultiCultyv3', 0.7132, 0.8216, 0.7395, 0.6792, 0.55, 0.712],
    ['CultriX/Qwen2.5-14B-Emergedv3', 'https://huggingface.co./CultriX/Qwen2.5-14B-Emergedv3', 0.7436, 0.8312, 0.7519, 0.6585, 0.55, 0.7068],
    ['CultriX/SeQwence-14Bv1', 'https://huggingface.co./CultriX/SeQwence-14Bv1', 0.7278, 0.841, 0.7541, 0.6816, 0.52, 0.7539],
    ['CultriX/Qwen2.5-14B-Wernickev2', 'https://huggingface.co./CultriX/Qwen2.5-14B-Wernickev2', 0.7391, 0.8168, 0.7273, 0.622, 0.45, 0.7572],
    ['CultriX/Qwen2.5-14B-Wernickev3', 'https://huggingface.co./CultriX/Qwen2.5-14B-Wernickev3', 0.7357, 0.8148, 0.7245, 0.7023, 0.55, 0.7869],
    ['CultriX/Qwen2.5-14B-Wernickev4', 'https://huggingface.co./CultriX/Qwen2.5-14B-Wernickev4', 0.7355, 0.829, 0.7497, 0.6306, 0.48, 0.7635],
    ['CultriX/SeQwential-14B-v1', 'https://huggingface.co./CultriX/SeQwential-14B-v1', 0.7355, 0.8205, 0.7549, 0.6367, 0.48, 0.7626],
    ['CultriX/Qwen2.5-14B-Wernickev5', 'https://huggingface.co./CultriX/Qwen2.5-14B-Wernickev5', 0.7224, 0.8272, 0.7541, 0.679, 0.51, 0.7578],
    ['CultriX/Qwen2.5-14B-Wernickev6', 'https://huggingface.co./CultriX/Qwen2.5-14B-Wernickev6', 0.6994, 0.7549, 0.5816, 0.6991, 0.58, 0.7267],
    ['CultriX/Qwen2.5-14B-Wernickev7', 'https://huggingface.co./CultriX/Qwen2.5-14B-Wernickev7', 0.7147, 0.7599, 0.6097, 0.7056, 0.57, 0.7164],
    ['CultriX/Qwen2.5-14B-FinalMerge-tmp2', 'https://huggingface.co./CultriX/Qwen2.5-14B-FinalMerge-tmp2', 0.7255, 0.8192, 0.7535, 0.6671, 0.5, 0.7612],
    ['CultriX/Qwen2.5-14B-BrocaV8', 'https://huggingface.co./CultriX/Qwen2.5-14B-BrocaV8', 0.7415, 0.8396, 0.7334, 0.5785, 0.43, 0.7646],
    ['CultriX/Qwexit-2.5-14B-2024', 'https://huggingface.co./CultriX/Qwexit-2.5-14B-2024', 0.7253, 0.8174, 0.7456, 0.6688, 0.5300, 0.7027],
    ['CultriX/Qwen2.5-14B-BrocaV9', 'https://huggingface.co./CultriX/Qwen2.5-14B-BrocaV9', 0.7432, 0.8307, 0.7467, 0.6221, 0.5000, 0.7623],
    ['CultriX/Qwen2.5-14B-partialmergept1', 'https://huggingface.co./CultriX/Qwen2.5-14B-partialmergept1', 0.7389, 0.8370, 0.7451, 0.6715, 0.5700, 0.7308],
    ['CultriX/Qwen2.5-14B-partialmergept2', 'https://huggingface.co./CultriX/Qwen2.5-14B-partialmergept2', 0.7300, 0.8428, 0.7371, 0.5944, 0.4200, 0.7581],
    ['CultriX/model', 'https://huggingface.co./CultriX/model', 0.7010, 0.8320, 0.7194, 0.6158, 0.4700, 0.7385],
    ['CultriX/Qwen2.5-14B-BrocaFinal', 'https://huggingface.co./CultriX/Qwen2.5-14B-BrocaFinal', 0.6265, 0.7688, 0.7007, 0.7035, 0.5100, 0.7218],
    ['CultriX/Qwen2.5-14B-Hyperionv1', 'https://huggingface.co./CultriX/Qwen2.5-14B-Hyperionv1', 0.7300, 0.8477, 0.7448, 0.6063, 0.4400, 0.7651],
    ['CultriX/Qwen2.5-14B-Hyperionv3', 'https://huggingface.co./CultriX/Qwen2.5-14B-Hyperionv3', 0.7445, 0.8414, 0.7458, 0.6371, 0.4900, 0.7543],
    ['sometimesanotion/Lamarck-14B-v0.6', 'https://hf.xwall.us.kg.m/sometimesanotion/Lamarck-14B-v0.6', 0.7446, 0.8294, 0.7368, 0.6008, 0.4300, 0.7423],
    ['CultriX/Qwen2.5-14B-Hyper', 'https://hf.xwall.us.kg.m/CultriX/Qwen2.5-14B-Hyper', 0.7372, 0.8411, 0.7424, 0.5830, 0.4400, 0.7792],
    ['CultriX/Qwen2.5-14B-Hyperionv4', 'https://huggingface.co./CultriX/Qwen2.5-14B-Hyperionv4', 0.7305, 0.8359, 0.7454, 0.5827, 0.4600, 0.7797],
    ['CultriX/Qwen2.5-14B-Hyperionv5', 'https://huggingface.co./CultriX/Qwen2.5-14B-Hyperionv5', 0.7458, 0.8290, 0.7508, 0.6228, 0.5200, 0.7540],
    ['CultriX/Qwen2.5-14B-Hyperionv6', 'https://huggingface.co./CultriX/Qwen2.5-14B-Hyperionv6', 0.7430, 0.8308, 0.7353, 0.6184, 0.4500, 0.7665],
    ['CultriX/Qwen2.5-14B-Hyperionv7', 'https://huggingface.co./CultriX/Qwen2.5-14B-Hyperionv7', 0.7412, 0.8287, 0.7508, 0.6208, 0.4800, 0.7532],
]

columns = [
    "Model Configuration",
    "Model Link",
    "tinyArc",
    "tinyHellaswag",
    "tinyMMLU",
    "tinyTruthfulQA",
    "tinyTruthfulQA_mc1",
    "tinyWinogrande",
]

# The benchmark score columns (everything after the name and the link).
# Plots select these by name so derived values never leak into other plots.
SCORE_COLUMNS = columns[2:]

df_full = pd.DataFrame(data_full, columns=columns)


def _render_current_figure():
    """Save the active matplotlib figure as PNG and close it.

    Returns:
        A ``(pil_image, png_path)`` tuple: the rendered figure as a PIL image
        and the path of a temporary ``.png`` file holding the same bytes.
        The temporary file is not deleted automatically because it is handed
        to Gradio as a download.
    """
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    plt.close()
    png_bytes = png_buffer.getvalue()

    pil_image = Image.open(BytesIO(png_bytes))
    pil_image.load()  # decode now so the image does not depend on lazy reads

    # Close the handle before returning: an open handle leaks a descriptor
    # and blocks other readers of the file on Windows.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        tmp.write(png_bytes)
    return pil_image, tmp.name


def plot_average_scores():
    """Plot each model's mean score across all tasks as a horizontal bar chart.

    Models are ordered from highest to lowest average. The global ``df_full``
    is left untouched.

    Returns:
        ``(pil_image, png_path)`` as produced by ``_render_current_figure``.
    """
    ranked = pd.DataFrame({
        "Model Configuration": df_full["Model Configuration"],
        "Average Score": df_full[SCORE_COLUMNS].mean(axis=1),
    }).sort_values(by="Average Score", ascending=False)

    plt.figure(figsize=(14, 10))
    plt.barh(ranked["Model Configuration"], ranked["Average Score"])
    plt.title("Average Performance of Models Across Tasks", fontsize=16)
    plt.xlabel("Average Score", fontsize=14)
    plt.ylabel("Model Configuration", fontsize=14)
    plt.gca().invert_yaxis()  # best model at the top
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    return _render_current_figure()


def plot_task_performance():
    """Plot one line per model showing its score on every task.

    Returns:
        ``(pil_image, png_path)`` as produced by ``_render_current_figure``.
    """
    plt.figure(figsize=(16, 12))
    score_rows = df_full[SCORE_COLUMNS].to_numpy()
    for model, scores in zip(df_full["Model Configuration"], score_rows):
        plt.plot(SCORE_COLUMNS, scores, marker="o", label=model)

    plt.title("Performance of All Models Across Tasks", fontsize=16)
    plt.xlabel("Task", fontsize=14)
    plt.ylabel("Score", fontsize=14)
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    return _render_current_figure()


def plot_task_specific_top_models():
    """Plot the best score per task, labelling each bar with the winning model.

    Returns:
        ``(pil_image, png_path)`` as produced by ``_render_current_figure``.
    """
    # Index by model name so idxmax() yields names rather than row numbers.
    scores = df_full.set_index("Model Configuration")[SCORE_COLUMNS]
    results = (
        pd.DataFrame({"Top Model": scores.idxmax(), "Score": scores.max()})
        .reset_index()
        .rename(columns={"index": "Task"})
    )

    plt.figure(figsize=(14, 8))
    plt.bar(results["Task"], results["Score"])
    # Categorical bars sit at x = 0, 1, 2, ...; put the model name above each.
    for position, (model, score) in enumerate(zip(results["Top Model"], results["Score"])):
        plt.text(position, score, model, ha="center", va="bottom", fontsize=8)
    plt.title("Task-Specific Top Models", fontsize=16)
    plt.xlabel("Task", fontsize=14)
    plt.ylabel("Score", fontsize=14)
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.tight_layout()
    return _render_current_figure()


def plot_heatmap():
    """Plot a heatmap of per-task scores plus a "Total Scores" column.

    Cell colours come from per-column min-max normalised values; the
    annotations show the actual scores. The global ``df_full`` is left
    untouched.

    Returns:
        ``(pil_image, png_path)`` as produced by ``_render_current_figure``.
    """
    scores = df_full[SCORE_COLUMNS].copy()
    # Add a column for the total scores across all tasks
    scores["Total Scores"] = scores.sum(axis=1)

    # Normalize each column individually for consistent coloring
    normalized_data = scores.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)

    plt.figure(figsize=(14, 10))
    sns.heatmap(
        normalized_data,
        annot=scores,  # Show actual values in annotations
        cmap="YlGnBu",
        xticklabels=list(scores.columns),
        yticklabels=df_full["Model Configuration"].tolist(),
    )
    plt.title("Performance Heatmap", fontsize=16)
    plt.tight_layout()
    return _render_current_figure()


# Substrings that mark a scrape_mergekit_config() result as "nothing usable".
_SCRAPE_FAILURE_MARKERS = (
    "No YAML configuration found",
    "Failed to fetch model page",
    "No data found for model",
)


def _is_scrape_failure(text):
    """Return True if *text* is empty or one of the scrape failure messages."""
    return not text or any(marker in text for marker in _SCRAPE_FAILURE_MARKERS)


def scrape_mergekit_config(model_name):
    """
    For the *tiny* table's model links.
    Scrapes <pre> tags on the huggingface model page to find a YAML config.

    Returns the text of the first <pre> element, or a human-readable message
    when the model is unknown, the page cannot be fetched, or no <pre> exists.
    """
    df_row = df_full.loc[df_full["Model Configuration"] == model_name]
    if df_row.empty:
        return f"No data found for model {model_name}."

    model_link = df_row["Model Link"].values[0]
    try:
        response = requests.get(model_link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Report network errors the same way as a bad status code so callers
        # that look for this message keep working.
        return f"Failed to fetch model page for {model_name}. Please check the link."
    if response.status_code != 200:
        return f"Failed to fetch model page for {model_name}. Please check the link."

    soup = BeautifulSoup(response.text, "html.parser")
    yaml_config = soup.find("pre")  # Assume YAML is in <pre> tags
    if yaml_config:
        return yaml_config.text.strip()
    return f"No YAML configuration found for {model_name}."


def download_yaml(yaml_content, model_name):
    """
    Let users download the scraped YAML if it exists.

    Returns the path of a temporary ``<model>_config.yaml`` file for the
    Gradio File output, or None when there is nothing to download.
    """
    if not model_name or _is_scrape_failure(yaml_content):
        return None

    filename = f"{model_name.replace('/', '_')}_config.yaml"
    # A fresh directory lets the download keep its readable file name.
    yaml_path = os.path.join(tempfile.mkdtemp(), filename)
    with open(yaml_path, "w", encoding="utf-8") as handle:
        handle.write(yaml_content)
    return yaml_path


def _fetch_yaml_and_metadata(url):
    """Fetch *url* and extract the first <pre> block and ``div.metadata`` text.

    Returns:
        A dict with keys ``"yaml_configuration"`` and ``"metadata"`` on
        success, or an ``"Error: ..."`` string when the request or the
        parsing fails.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            return f"Error: Unable to fetch the page (Status Code: {response.status_code})"

        soup = BeautifulSoup(response.text, "html.parser")

        yaml_config = soup.find("pre")
        yaml_text = yaml_config.text.strip() if yaml_config else "No YAML configuration found."

        metadata_section = soup.find("div", class_="metadata")
        metadata_text = metadata_section.text.strip() if metadata_section else "No metadata found."

        return {
            "yaml_configuration": yaml_text,
            "metadata": metadata_text
        }
    except Exception as e:
        # UI boundary: show any failure to the user instead of crashing.
        return f"Error: {str(e)}"


def scrape_model_page(model_url):
    """
    Used for the "Live Scraping" text box in the Gradio UI.

    Returns a Markdown-style string with the scraped YAML and metadata, or an
    "Error: ..." string when the page cannot be fetched or parsed.
    """
    scraped = _fetch_yaml_and_metadata(model_url)
    if isinstance(scraped, str):
        return scraped
    return (
        f"**YAML Configuration:**\n{scraped['yaml_configuration']}"
        f"\n\n**Metadata:**\n{scraped['metadata']}"
    )


def display_scraped_model_data(model_url):
    """
    Helper for the "Live Scraping Features" section of the Gradio app.
    """
    return scrape_model_page(model_url)


def download_all_data():
    """
    Builds a zip of:
      - the CSV of your 'tiny' data,
      - four plots (average performance, task performance, top models, heatmap),
      - any YAML configurations for the 'tiny' table's models (if found).

    Returns the path of the zip file (``analysis_data.zip`` inside a fresh
    temporary directory) for the Gradio File output.
    """
    csv_data = df_full.to_csv(index=False).encode('utf-8')

    plot_dict = {
        "average_performance": plot_average_scores(),
        "task_performance": plot_task_performance(),
        "top_models": plot_task_specific_top_models(),
        "heatmap": plot_heatmap(),
    }

    zip_path = os.path.join(tempfile.mkdtemp(), "analysis_data.zip")
    with zipfile.ZipFile(zip_path, 'w') as zf:
        zf.writestr("model_scores.csv", csv_data)

        # Add the images under readable names instead of their temp paths.
        for name, (_pil_image, png_path) in plot_dict.items():
            zf.write(png_path, arcname=f"{name}.png")

        # Also try scraping each model in the *tiny* dataset for a YAML config
        for model_name in df_full["Model Configuration"].to_list():
            yaml_content = scrape_mergekit_config(model_name)
            if not _is_scrape_failure(yaml_content):
                zf.writestr(f"{model_name.replace('/', '_')}_config.yaml", yaml_content.encode())

    return zip_path


# --------------------------------------------------------------------
# PART 2: THE "DATA START" SNIPPET (RANKS 44–105) + Parser
# --------------------------------------------------------------------
# This is your larger dataset, rank = 44..105
benchmark_data = [
    {
        "rank": 44,
        "name": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3",
        "scores": {
            "average": 40.10, "IFEval": 72.57, "BBH": 48.58, "MATH": 34.44,
            "GPQA": 17.34, "MUSR": 19.39, "MMLU-PRO": 48.26
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwen2.5-14B-Vimarckoso-v3",
        "known_config": {
            "models": [
                {"model": "CultriX/SeQwence-14Bv1"},
                {"model": "allknowingroger/Qwenslerp5-14B"}
            ],
            "merge_method": "slerp",
            "base_model": "CultriX/SeQwence-14Bv1",
            "dtype": "bfloat16",
            "parameters": {
                "t": [0, 0.5, 1, 0.5, 0]
            }
        }
    },
    {
        "rank": 45,
        "name": "sthenno-com/miscii-14b-1225",
        "scores": {
            "average": 40.08, "IFEval": 78.78, "BBH": 50.91, "MATH": 31.57,
            "GPQA": 17.00, "MUSR": 14.77, "MMLU-PRO": 47.46
        },
        "hf_url": "https://huggingface.co./sthenno-com/miscii-14b-1225",
        "known_config": None
    },
    {
        "rank": 46,
        "name": "djuna/Q2.5-Veltha-14B-0.5",
        "scores": {
            "average": 39.96, "IFEval": 77.96, "BBH": 50.32, "MATH": 33.84,
            "GPQA": 15.77, "MUSR": 14.17, "MMLU-PRO": 47.72
        },
        "hf_url": "https://huggingface.co./djuna/Q2.5-Veltha-14B-0.5",
        "known_config": None
    },
    {
        "rank": 48,
        "name": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock",
        "scores": {
            "average": 39.81, "IFEval": 71.62, "BBH": 48.76, "MATH": 33.99,
            "GPQA": 17.34, "MUSR": 19.23, "MMLU-PRO": 47.95
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock",
        "known_config": None
    },
    {
        "rank": 50,
        "name": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01",
        "scores": {
            "average": 39.46, "IFEval": 68.72, "BBH": 47.71, "MATH": 35.05,
            "GPQA": 18.23, "MUSR": 19.56, "MMLU-PRO": 47.50
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01",
        "known_config": None
    },
    {
        "rank": 52,
        "name": "arcee-ai/Virtuoso-Small",
        "scores": {
            "average": 39.43, "IFEval": 79.35, "BBH": 50.40, "MATH": 34.29,
            "GPQA": 11.52, "MUSR": 14.44, "MMLU-PRO": 46.57
        },
        "hf_url": "https://huggingface.co./arcee-ai/Virtuoso-Small",
        "known_config": None
    },
    {
        "rank": 54,
        "name": "sometimesanotion/Qwentinuum-14B-v6",
        "scores": {
            "average": 39.23, "IFEval": 63.04, "BBH": 50.23, "MATH": 33.84,
            "GPQA": 18.23, "MUSR": 21.18, "MMLU-PRO": 48.89
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwentinuum-14B-v6",
        "known_config": None
    },
    {
        "rank": 55,
        "name": "djuna/Q2.5-Veltha-14B",
        "scores": {
            "average": 39.21, "IFEval": 82.92, "BBH": 49.75, "MATH": 28.02,
            "GPQA": 14.54, "MUSR": 12.26, "MMLU-PRO": 47.76
        },
        "hf_url": "https://huggingface.co./djuna/Q2.5-Veltha-14B",
        "known_config": None
    },
    {
        "rank": 57,
        "name": "allknowingroger/QwenSlerp6-14B",
        "scores": {
            "average": 39.02, "IFEval": 68.67, "BBH": 47.59, "MATH": 34.14,
            "GPQA": 16.44, "MUSR": 18.32, "MMLU-PRO": 48.95
        },
        "hf_url": "https://huggingface.co./allknowingroger/QwenSlerp6-14B",
        "known_config": None
    },
    {
        "rank": 58,
        "name": "allknowingroger/QwenSlerp5-14B",
        "scores": {
            "average": 38.94, "IFEval": 71.19, "BBH": 47.39, "MATH": 33.16,
            "GPQA": 15.32, "MUSR": 17.81, "MMLU-PRO": 48.78
        },
        "hf_url": "https://huggingface.co./allknowingroger/QwenSlerp5-14B",
        "known_config": None
    },
    {
        "rank": 59,
        "name": "sometimesanotion/Qwentinuum-14B-v5",
        "scores": {
            "average": 38.87, "IFEval": 62.86, "BBH": 50.28, "MATH": 31.57,
            "GPQA": 18.34, "MUSR": 21.09, "MMLU-PRO": 49.09
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwentinuum-14B-v5",
        "known_config": None
    },
    {
        "rank": 60,
        "name": "sometimesanotion/Qwenvergence-14B-v6-Prose",
        "scores": {
            "average": 38.82, "IFEval": 59.90, "BBH": 50.12, "MATH": 34.89,
            "GPQA": 18.46, "MUSR": 21.02, "MMLU-PRO": 48.56
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwenvergence-14B-v6-Prose",
        "known_config": None
    },
    {
        "rank": 61,
        "name": "CultriX/Qwen2.5-14B-Brocav3",
        "scores": {
            "average": 38.76, "IFEval": 69.52, "BBH": 49.05, "MATH": 32.25,
            "GPQA": 14.54, "MUSR": 19.25, "MMLU-PRO": 47.97
        },
        "hf_url": "https://huggingface.co./CultriX/Qwen2.5-14B-Brocav3",
        "known_config": None
    },
    {
        "rank": 62,
        "name": "sometimesanotion/Qwentinuum-14B-v7",
        "scores": {
            "average": 38.76, "IFEval": 61.09, "BBH": 50.35, "MATH": 33.38,
            "GPQA": 18.79, "MUSR": 19.95, "MMLU-PRO": 49.00
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwentinuum-14B-v7",
        "known_config": None
    },
    {
        "rank": 64,
        "name": "sometimesanotion/Qwentinuum-14B-v3",
        "scores": {
            "average": 38.74, "IFEval": 61.58, "BBH": 50.04, "MATH": 32.85,
            "GPQA": 18.34, "MUSR": 20.62, "MMLU-PRO": 49.03
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwentinuum-14B-v3",
        "known_config": None
    },
    {
        "rank": 65,
        "name": "allura-org/TQ2.5-14B-Aletheia-v1",
        "scores": {
            "average": 38.74, "IFEval": 75.30, "BBH": 50.88, "MATH": 29.53,
            "GPQA": 14.99, "MUSR": 14.61, "MMLU-PRO": 47.12
        },
        "hf_url": "https://huggingface.co./allura-org/TQ2.5-14B-Aletheia-v1",
        "known_config": None
    },
    {
        "rank": 66,
        "name": "qingy2024/Fusion4-14B-Instruct",
        "scores": {
            "average": 38.73, "IFEval": 76.49, "BBH": 50.70, "MATH": 33.91,
            "GPQA": 10.74, "MUSR": 13.97, "MMLU-PRO": 46.60
        },
        "hf_url": "https://huggingface.co./qingy2024/Fusion4-14B-Instruct",
        "known_config": None
    },
    {
        "rank": 68,
        "name": "CultriX/Qwen2.5-14B-Brocav7",
        "scores": {
            "average": 38.52, "IFEval": 67.24, "BBH": 48.91, "MATH": 31.87,
            "GPQA": 15.66, "MUSR": 20.15, "MMLU-PRO": 47.31
        },
        "hf_url": "https://huggingface.co./CultriX/Qwen2.5-14B-Brocav7",
        "known_config": None
    },
    {
        "rank": 71,
        "name": "sometimesanotion/Qwentinuum-14B-v6-Prose",
        "scores": {
            "average": 38.46, "IFEval": 56.43, "BBH": 50.14, "MATH": 35.57,
            "GPQA": 18.46, "MUSR": 21.34, "MMLU-PRO": 48.80
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwentinuum-14B-v6-Prose",
        "known_config": None
    },
    {
        "rank": 76,
        "name": "CultriX/Qwen2.5-14B-Brocav6",
        "scores": {
            "average": 38.32, "IFEval": 69.95, "BBH": 47.82, "MATH": 29.61,
            "GPQA": 15.66, "MUSR": 18.88, "MMLU-PRO": 47.99
        },
        "hf_url": "https://huggingface.co./CultriX/Qwen2.5-14B-Brocav6",
        "known_config": None
    },
    {
        "rank": 80,
        "name": "CultriX/SeQwence-14Bv1",
        "scores": {
            "average": 38.20, "IFEval": 66.78, "BBH": 47.19, "MATH": 33.53,
            "GPQA": 14.88, "MUSR": 18.80, "MMLU-PRO": 48.00
        },
        "hf_url": "https://huggingface.co./CultriX/SeQwence-14Bv1",
        "known_config": None
    },
    {
        "rank": 85,
        "name": "sometimesanotion/Qwentinuum-14B-v013",
        "scores": {
            "average": 37.96, "IFEval": 67.11, "BBH": 43.97, "MATH": 33.01,
            "GPQA": 14.32, "MUSR": 24.99, "MMLU-PRO": 44.34
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwentinuum-14B-v013",
        "known_config": None
    },
    {
        "rank": 86,
        "name": "CultriX/Qwen2.5-14B-Wernickev3",
        "scores": {
            "average": 37.94, "IFEval": 70.48, "BBH": 44.58, "MATH": 32.78,
            "GPQA": 14.99, "MUSR": 18.69, "MMLU-PRO": 46.13
        },
        "hf_url": "https://huggingface.co./CultriX/Qwen2.5-14B-Wernickev3",
        "known_config": None
    },
    {
        "rank": 88,
        "name": "allknowingroger/QwenSlerp4-14B",
        "scores": {
            "average": 37.80, "IFEval": 63.28, "BBH": 49.38, "MATH": 30.97,
            "GPQA": 16.33, "MUSR": 17.59, "MMLU-PRO": 49.28
        },
        "hf_url": "https://huggingface.co./allknowingroger/QwenSlerp4-14B",
        "known_config": None
    },
    {
        "rank": 89,
        "name": "CultriX/Qwen2.5-14B-Broca",
        "scores": {
            "average": 37.72, "IFEval": 56.04, "BBH": 50.03, "MATH": 34.59,
            "GPQA": 18.23, "MUSR": 18.95, "MMLU-PRO": 48.49
        },
        "hf_url": "https://huggingface.co./CultriX/Qwen2.5-14B-Broca",
        "known_config": None
    },
    {
        "rank": 90,
        "name": "CultriX/Qwen2.5-14B-Emerged",
        "scores": {
            "average": 37.66, "IFEval": 70.00, "BBH": 45.93, "MATH": 30.74,
            "GPQA": 14.32, "MUSR": 18.47, "MMLU-PRO": 46.51
        },
        "hf_url": "https://huggingface.co./CultriX/Qwen2.5-14B-Emerged",
        "known_config": None
    },
    {
        "rank": 91,
        "name": "sometimesanotion/Qwentinuum-14B-v8",
        "scores": {
            "average": 37.65, "IFEval": 54.12, "BBH": 50.11, "MATH": 34.14,
            "GPQA": 17.79, "MUSR": 20.75, "MMLU-PRO": 49.02
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwentinuum-14B-v8",
        "known_config": None
    },
    {
        "rank": 92,
        "name": "qingy2024/Fusion-14B-Instruct",
        "scores": {
            "average": 37.64, "IFEval": 72.60, "BBH": 48.58, "MATH": 30.97,
            "GPQA": 13.98, "MUSR": 14.81, "MMLU-PRO": 44.93
        },
        "hf_url": "https://huggingface.co./qingy2024/Fusion-14B-Instruct",
        "known_config": None
    },
    {
        "rank": 94,
        "name": "CultriX/Qwestion-14B",
        "scores": {
            "average": 37.63, "IFEval": 63.18, "BBH": 48.76, "MATH": 31.72,
            "GPQA": 15.77, "MUSR": 17.22, "MMLU-PRO": 49.14
        },
        "hf_url": "https://huggingface.co./CultriX/Qwestion-14B",
        "known_config": None
    },
    {
        "rank": 99,
        "name": "sometimesanotion/Qwenvergence-14B-v3-Prose",
        "scores": {
            "average": 37.37, "IFEval": 49.18, "BBH": 49.80, "MATH": 35.57,
            "GPQA": 19.35, "MUSR": 21.77, "MMLU-PRO": 48.55
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwenvergence-14B-v3-Prose",
        "known_config": None
    },
    {
        "rank": 102,
        "name": "CultriX/SeQwence-14B-v5",
        "scores": {
            "average": 37.27, "IFEval": 59.20, "BBH": 50.00, "MATH": 31.04,
            "GPQA": 16.00, "MUSR": 18.33, "MMLU-PRO": 49.05
        },
        "hf_url": "https://huggingface.co./CultriX/SeQwence-14B-v5",
        "known_config": None
    },
    {
        "rank": 103,
        "name": "sometimesanotion/Qwen-14B-ProseStock-v4",
        "scores": {
            "average": 37.23, "IFEval": 49.42, "BBH": 49.54, "MATH": 35.50,
            "GPQA": 18.46, "MUSR": 21.70, "MMLU-PRO": 48.74
        },
        "hf_url": "https://huggingface.co./sometimesanotion/Qwen-14B-ProseStock-v4",
        "known_config": None
    },
    {
        "rank": 104,
        "name": "sometimesanotion/IF-reasoning-experiment-40",
        "scores": {
            "average": 37.21, "IFEval": 63.30, "BBH": 44.31, "MATH": 27.72,
            "GPQA": 17.34, "MUSR": 25.86, "MMLU-PRO": 44.72
        },
        "hf_url": "https://huggingface.co./sometimesanotion/IF-reasoning-experiment-40",
        "known_config": None
    },
    {
        "rank": 105,
        "name": "CultriX/SeQwence-14B-EvolMerge",
        "scores": {
            "average": 37.20, "IFEval": 53.82, "BBH": 50.78, "MATH": 31.80,
            "GPQA": 17.45, "MUSR": 20.26, "MMLU-PRO": 49.10
        },
        "hf_url": "https://huggingface.co./CultriX/SeQwence-14B-EvolMerge",
        "known_config": None
    }
]


def snippet_scrape_model_page(url):
    """
    Equivalent scraping function for the larger dataset
    to look for <pre> YAML and a .metadata section.

    Returns a dict with "yaml_configuration" and "metadata" keys, or an
    "Error: ..." string when the page cannot be fetched or parsed.
    """
    return _fetch_yaml_and_metadata(url)


def _standalone_scrape_script(hf_url):
    """Return the text of a small standalone script that scrapes *hf_url*."""
    return f'''import requests
from bs4 import BeautifulSoup

def scrape_model_page(model_url):
    try:
        response = requests.get(model_url)
        if response.status_code != 200:
            return f"Error: Unable to fetch the page (Status Code: {{response.status_code}})"

        soup = BeautifulSoup(response.text, "html.parser")

        yaml_config = soup.find("pre")
        yaml_text = yaml_config.text.strip() if yaml_config else "No YAML configuration found."

        metadata_section = soup.find("div", class_="metadata")
        metadata_text = metadata_section.text.strip() if metadata_section else "No metadata found."

        return {{
            "yaml_configuration": yaml_text,
            "metadata": metadata_text
        }}

    except Exception as e:
        return f"Error: {{str(e)}}"

if __name__ == "__main__":
    model_url = "{hf_url}"
    result = scrape_model_page(model_url)
    print(result)'''


def snippet_print_benchmark_and_config_info(model_info):
    """
    Prints an overview for each model in the rank=44..105 dataset.
    If known_config is not None, prints it. Otherwise attempts to scrape.

    ``model_info`` is one entry of ``benchmark_data``.
    """
    scores = model_info["scores"]
    print(f"---\nModel Rank: {model_info['rank']}")
    print(f"Model Name: {model_info['name']}")
    print(f"Model average score across benchmarks in %: {scores['average']}")
    print(f"Models average score on IFEval benchmarks in %: {scores['IFEval']}")
    print(f"Models average score on BBH benchmarks in %: {scores['BBH']}")
    print(f"Models average score on MATH benchmarks in %: {scores['MATH']}")
    print(f"Models average score in GPQA benchmarks in %: {scores['GPQA']}")
    print(f"Models average score in MUSR benchmarks in %: {scores['MUSR']}")
    print(f"Models average score in MMLU-PRO benchmarks in %: {scores['MMLU-PRO']}")

    # If there's a known_config, print it in YAML form and stop.
    known_config = model_info["known_config"]
    if known_config is not None:
        print("###")
        print("models:")
        for m in known_config["models"]:
            print(f" - model: {m['model']}")
        print(f"merge_method: {known_config['merge_method']}")
        print(f"base_model: {known_config['base_model']}")
        print(f"dtype: {known_config['dtype']}")
        print("parameters:")
        t_vals = known_config["parameters"]["t"]
        print(f" t: {t_vals} # V shaped curve: Hermes for input & output, WizardMath in the middle layers")
        print("###")
        return

    # Otherwise, do scraping:
    scraped = snippet_scrape_model_page(model_info["hf_url"])
    if isinstance(scraped, str):
        # Means it's an error string or something
        print("(No MergeKit configuration found or scraping error.)")
        print(scraped)
        return

    # It's a dict from here on.
    if "No YAML configuration found." in scraped["yaml_configuration"]:
        print("(No MergeKit configuration found.)\n")
        print("You can try the following Python script to scrape the model page:\n")
        print("#" * 70)
        print(_standalone_scrape_script(model_info["hf_url"]))
        print("#" * 70)
    else:
        # Found some YAML
        print("###")
        print(scraped["yaml_configuration"])
        print("###")


def run_non_tiny_benchmarks():
    """
    Captures the stdout from printing each model in benchmark_data (ranks 44..105),
    returning the entire output as a single string for Gradio to display.
    """
    buffer = io.StringIO()
    # redirect_stdout restores sys.stdout even if printing a model raises.
    with contextlib.redirect_stdout(buffer):
        for model in benchmark_data:
            snippet_print_benchmark_and_config_info(model)
    return buffer.getvalue()


# --------------------------------------------------------------------
# PART 3: The Gradio App
# --------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Comprehensive Model Performance Analysis with Hugging Face Links")

    # The existing UI for the "tiny" data
    with gr.Row():
        btn1 = gr.Button("Show Average Performance")
        img1 = gr.Image(type="pil", label="Average Performance Plot")
        img1_download = gr.File(label="Download Average Performance")
        btn1.click(plot_average_scores, outputs=[img1, img1_download])

    with gr.Row():
        btn2 = gr.Button("Show Task Performance")
        img2 = gr.Image(type="pil", label="Task Performance Plot")
        img2_download = gr.File(label="Download Task Performance")
        btn2.click(plot_task_performance, outputs=[img2, img2_download])

    with gr.Row():
        btn3 = gr.Button("Task-Specific Top Models")
        img3 = gr.Image(type="pil", label="Task-Specific Top Models Plot")
        img3_download = gr.File(label="Download Top Models")
        btn3.click(plot_task_specific_top_models, outputs=[img3, img3_download])

    with gr.Row():
        btn4 = gr.Button("Plot Performance Heatmap")
        heatmap_img = gr.Image(type="pil", label="Performance Heatmap")
        heatmap_download = gr.File(label="Download Heatmap")
        btn4.click(plot_heatmap, outputs=[heatmap_img, heatmap_download])

    # Scraping & YAML handling for the *tiny* table
    with gr.Row():
        model_selector = gr.Dropdown(choices=df_full["Model Configuration"].tolist(), label="Select a Model")
        with gr.Column():
            scrape_btn = gr.Button("Scrape MergeKit Configuration")
            yaml_output = gr.Textbox(lines=10, placeholder="YAML Configuration will appear here.")
            scrape_btn.click(scrape_mergekit_config, inputs=model_selector, outputs=yaml_output)
        with gr.Column():
            save_yaml_btn = gr.Button("Save MergeKit Configuration")
            yaml_download = gr.File(label="Download MergeKit Configuration")
            save_yaml_btn.click(download_yaml, inputs=[yaml_output, model_selector], outputs=yaml_download)

    # Download everything (CSV, plots, any found YAML)
    with gr.Row():
        download_all_btn = gr.Button("Download Everything")
        all_downloads = gr.File(label="Download All Data")
        download_all_btn.click(download_all_data, outputs=all_downloads)

    # Live Scraping
    gr.Markdown("## Live Scraping Features")
    with gr.Row():
        url_input = gr.Textbox(label="Enter Hugging Face Model URL", placeholder="https://huggingface.co./")
        live_scrape_btn = gr.Button("Scrape Model Page")
        live_scrape_output = gr.Textbox(label="Scraped Data", lines=15)
        live_scrape_btn.click(display_scraped_model_data, inputs=url_input, outputs=live_scrape_output)

    # Non-Tiny Benchmarks
    gr.Markdown("## Non-Tiny Benchmark Parser (Ranks 44–105)")
    with gr.Row():
        parse_non_tiny_btn = gr.Button("Parse Non-Tiny Benchmarks")
        parse_non_tiny_output = gr.Textbox(label="Non-Tiny Benchmark Output", lines=30)
        parse_non_tiny_btn.click(fn=run_non_tiny_benchmarks, outputs=parse_non_tiny_output)

# Only start the server when run as a script, so the module can be imported
# (e.g. by tests or other tooling) without launching the app.
if __name__ == "__main__":
    demo.launch()