{ "cells": [ { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "import json\n", "from pathlib import Path\n", "\n", "import gradio as gr\n", "import pandas as pd\n", "\n", "TITLE = \"\"\"

LLM Leaderboard for H4 Models

\"\"\"\n", "\n", "DESCRIPTION = f\"\"\"\n", "Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.\n", "\"\"\"\n", "\n", "BENCHMARKS_TO_SKIP = [\"math\", \"mini_math\"]\n", "\n", "\n", "def get_leaderboard_df(agg : str = \"max\"):\n", " filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n", "\n", " # Parse filepaths to get unique models\n", " models = set()\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " model_revision = \"_\".join(path_parts[1:4])\n", " models.add(model_revision)\n", "\n", " # Initialize DataFrame\n", " df = pd.DataFrame(index=list(models))\n", "\n", " # Extract data from each file and populate the DataFrame\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " date = filepath.stem.split(\"_\")[-1][:-3]\n", " model_revision = \"_\".join(path_parts[1:4]) + \"_\" + date\n", " task = path_parts[4]\n", " df.loc[model_revision, \"Date\"] = date\n", "\n", " with open(filepath, \"r\") as file:\n", " data = json.load(file)\n", " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n", " # Skip benchmarks that we don't want to include in the leaderboard\n", " if task.lower() in BENCHMARKS_TO_SKIP:\n", " continue\n", " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n", " if task.lower() == \"truthfulqa\":\n", " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n", " # IFEval has several metrics but we report just the prompt-loose-acc one\n", " elif task.lower() == \"ifeval\":\n", " value = data[\"results\"][first_result_key][\"prompt_level_loose_acc\"]\n", " # MMLU has several metrics but we report just the average one\n", " elif task.lower() == \"mmlu\":\n", " value = [v[\"acc\"] for k, v in data[\"results\"].items() if \"_average\" in k.lower()][0]\n", " # HellaSwag and ARC reports acc_norm\n", " elif task.lower() in [\"hellaswag\", \"arc\"]:\n", " value = data[\"results\"][first_result_key][\"acc_norm\"]\n", " # BBH has several metrics but we report just the average one\n", " elif task.lower() == \"bbh\":\n", " if \"all\" in data[\"results\"]:\n", " value = data[\"results\"][\"all\"][\"acc\"]\n", " else:\n", " value = -100\n", " # AGIEval reports acc_norm\n", " elif task.lower() == \"agieval\":\n", " value = data[\"results\"][\"all\"][\"acc_norm\"]\n", " # MATH reports qem\n", " elif task.lower() in [\"math\", \"math_v2\", \"aimo_kaggle\"]:\n", " value = data[\"results\"][\"all\"][\"qem\"]\n", " else:\n", " first_metric_key = next(\n", " iter(data[\"results\"][first_result_key])\n", " ) # gets the first key in the first result\n", " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n", "\n", " # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe\n", " if task.lower() in [\"mini_math_v2\"]:\n", " for k, v in data[\"results\"].items():\n", " if k != \"all\":\n", " level = k.split(\"|\")[1].split(\":\")[-1]\n", " value = v[\"qem\"]\n", " df.loc[model_revision, f\"{task}_{level}\"] = value\n", " # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe\n", " elif task.lower() in [\"aimo_kaggle_medium_pot\"]:\n", " for k, v in data[\"results\"].items():\n", " if k != \"all\" and \"_average\" not in k:\n", " version = k.split(\"|\")[1].split(\":\")[-1]\n", 
" value = v[\"qem\"] if \"qem\" in v else v[\"score\"]\n", " df.loc[model_revision, f\"{task}_{version}\"] = value\n", " # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe\n", " elif task.lower() in [\"aimo_kaggle_hard_pot\"]:\n", " for k, v in data[\"results\"].items():\n", " if k != \"all\" and \"_average\" not in k:\n", " version = k.split(\"|\")[1].split(\":\")[-1]\n", " value = v[\"qem\"] if \"qem\" in v else v[\"score\"]\n", " df.loc[model_revision, f\"{task}_{version}\"] = value\n", " # For kaggle_tora we report accuracy, so need to divide by 100\n", " elif task.lower() in [\n", " \"aimo_tora_eval_kaggle_medium\",\n", " \"aimo_tora_eval_kaggle_hard\",\n", " \"aimo_kaggle_fast_eval_hard\",\n", " \"aimo_kaggle_tora_medium\",\n", " \"aimo_kaggle_tora_hard\",\n", " \"aimo_kaggle_tora_medium_extended\",\n", " \"aimo_kaggle_tora_hard_extended\",\n", " ]:\n", " for k, v in data[\"results\"].items():\n", " value = float(v[\"qem\"]) / 100.0\n", " df.loc[model_revision, f\"{task}\"] = value\n", " # For AlpacaEval we report base winrate and lenght corrected one\n", " elif task.lower() == \"alpaca_eval\":\n", " value = data[\"results\"][first_result_key][\"win_rate\"]\n", " df.loc[model_revision, \"Alpaca_eval\"] = value / 100.0\n", " value = data[\"results\"][first_result_key][\"length_controlled_winrate\"]\n", " df.loc[model_revision, \"Alpaca_eval_lc\"] = value / 100.0\n", " else:\n", " df.loc[model_revision, task] = float(value)\n", "\n", " # Drop rows where every entry is NaN\n", " df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n", "\n", " # Trim minimath column names\n", " df.columns = [c.replace(\"_level_\", \"_l\") for c in df.columns]\n", "\n", " # Trim AIMO column names\n", " df.columns = [c.replace(\"aimo_\", \"\") for c in df.columns]\n", "\n", " df.insert(loc=0, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n", "\n", " # Convert all values to percentage\n", " df[df.select_dtypes(include=[\"number\"]).columns] *= 100.0\n", " df = df.sort_values(by=[\"Average\"], ascending=False)\n", " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(2)\n", " # Strip off date from model name\n", " df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n", "\n", " # Drop date and aggregate results by model name\n", " df = df.drop(\"Date\", axis=1).groupby(\"Model\").agg(agg).reset_index()\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "df = get_leaderboard_df(agg='mean')" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# df" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelAveragekaggle_tora_medium_extendedkaggle_tora_hard_extended
1741AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits28.8961.4528.89
\n", "
" ], "text/plain": [ " Model Average \\\n", "1741 AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits 28.89 \n", "\n", " kaggle_tora_medium_extended kaggle_tora_hard_extended \n", "1741 61.45 28.89 " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.query(\"Model == 'AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits'\").dropna(axis=1, how=\"all\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelAveragekaggle_tora_medium_extendedkaggle_tora_hard_extended
1741AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits65.0665.0632.22
\n", "
" ], "text/plain": [ " Model Average \\\n", "1741 AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits 65.06 \n", "\n", " kaggle_tora_medium_extended kaggle_tora_hard_extended \n", "1741 65.06 32.22 " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.query(\"Model == 'AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits'\").dropna(axis=1, how=\"all\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelDateIfevalTruthfulqaWinograndeGsm8kMmluHellaswagArc
0NousResearch_Nous-Hermes-2-Yi-34B_main2024-03-0439.0061.4480.5867.9376.2483.7968.00
1deepseek-ai_deepseek-llm-67b-chat_main2024-03-0555.2757.7879.1676.1271.1883.9464.16
2NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main2024-03-0259.3364.7678.5362.1771.9685.4270.82
3mistralai_Mixtral-8x7B-Instruct-v0.1_main2024-03-0255.0870.7973.5659.8970.6086.6872.01
4deepseek-ai_deepseek-llm-67b-chat_main2024-03-0455.2757.7879.1676.1271.1883.9464.16
..............................
269HuggingFaceH4_starcoder2-15b-ift_v18.02024-03-1021.63NaNNaN0.83NaNNaNNaN
270HuggingFaceH4_mistral-7b-ift_v49.02024-03-0720.15NaNNaN0.00NaNNaNNaN
271HuggingFaceH4_starchat-beta_main2024-03-128.13NaNNaNNaNNaNNaNNaN
272HuggingFaceH4_starcoder2-15b-ift_v7.02024-03-1012.57NaNNaN3.18NaNNaNNaN
273HuggingFaceH4_zephyr-7b-beta-ift_v1.12024-03-139.43NaNNaN0.00NaNNaNNaN
\n", "

274 rows × 9 columns

\n", "
" ], "text/plain": [ " Model Date Ifeval \\\n", "0 NousResearch_Nous-Hermes-2-Yi-34B_main 2024-03-04 39.00 \n", "1 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-05 55.27 \n", "2 NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main 2024-03-02 59.33 \n", "3 mistralai_Mixtral-8x7B-Instruct-v0.1_main 2024-03-02 55.08 \n", "4 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-04 55.27 \n", ".. ... ... ... \n", "269 HuggingFaceH4_starcoder2-15b-ift_v18.0 2024-03-10 21.63 \n", "270 HuggingFaceH4_mistral-7b-ift_v49.0 2024-03-07 20.15 \n", "271 HuggingFaceH4_starchat-beta_main 2024-03-12 8.13 \n", "272 HuggingFaceH4_starcoder2-15b-ift_v7.0 2024-03-10 12.57 \n", "273 HuggingFaceH4_zephyr-7b-beta-ift_v1.1 2024-03-13 9.43 \n", "\n", " Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n", "0 61.44 80.58 67.93 76.24 83.79 68.00 \n", "1 57.78 79.16 76.12 71.18 83.94 64.16 \n", "2 64.76 78.53 62.17 71.96 85.42 70.82 \n", "3 70.79 73.56 59.89 70.60 86.68 72.01 \n", "4 57.78 79.16 76.12 71.18 83.94 64.16 \n", ".. ... ... ... ... ... ... \n", "269 NaN NaN 0.83 NaN NaN NaN \n", "270 NaN NaN 0.00 NaN NaN NaN \n", "271 NaN NaN NaN NaN NaN NaN \n", "272 NaN NaN 3.18 NaN NaN NaN \n", "273 NaN NaN 0.00 NaN NaN NaN \n", "\n", "[274 rows x 9 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[[\"Model\", \"Date\"]].merge(new_df, on=\"Model\", how=\"left\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "hf", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 2 }