leukas committed
Commit f7ac2a9 · 1 Parent(s): 80cbf2f

initial commit
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Cute Leaderboard
-emoji: 🔥
+emoji: 🎀
 colorFrom: blue
 colorTo: blue
 sdk: gradio
@@ -9,4 +9,4 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
app.py ADDED
@@ -0,0 +1,85 @@
+import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+
+from src.about import (
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    AutoEvalColumn,
+    fields,
+)
+from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
+
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+
+### Space initialisation
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
+
+LEADERBOARD_DF = pd.read_csv(f"{EVAL_RESULTS_PATH}/results.csv")
+# Scale every score column (all except the first, "model") from fractions to percentages
+for col in LEADERBOARD_DF.columns[1:]:
+    LEADERBOARD_DF[col] = LEADERBOARD_DF[col] * 100
+
+# Add an average column over the numeric score columns
+LEADERBOARD_DF["Average"] = LEADERBOARD_DF.mean(axis=1, numeric_only=True)
+
+# Move the average column to the front, right after the model name
+LEADERBOARD_DF = LEADERBOARD_DF[["model", "Average"] + [col for col in LEADERBOARD_DF.columns if col not in ["model", "Average"]]]
+
+# Round scores to one decimal place
+LEADERBOARD_DF = LEADERBOARD_DF.round(1)
+
+# Sort rows by average score, best first
+LEADERBOARD_DF = LEADERBOARD_DF.sort_values(by="Average", ascending=False)
+
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
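To make the score pipeline in app.py concrete, here is a minimal sketch run on an invented stand-in for results.csv; the model names and task columns below are placeholders, while the real file comes from the RESULTS_REPO dataset:

import pandas as pd

# Invented stand-in for results.csv (all names are placeholders)
df = pd.DataFrame({
    "model": ["org/model-a", "org/model-b"],
    "spell": [0.90, 0.70],
    "cont_char": [0.70, 0.50],
})

for col in df.columns[1:]:  # fractions -> percentages
    df[col] = df[col] * 100
df["Average"] = df.mean(axis=1, numeric_only=True)
df = df[["model", "Average"] + [c for c in df.columns if c not in ["model", "Average"]]]
df = df.round(1).sort_values(by="Average", ascending=False)
print(df)
#          model  Average  spell  cont_char
# 0  org/model-a     80.0   90.0       70.0
# 1  org/model-b     60.0   70.0       50.0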
src/__pycache__/about.cpython-310.pyc ADDED
Binary file (1.45 kB).
 
src/__pycache__/envs.cpython-310.pyc ADDED
Binary file (480 Bytes).
 
src/about.py ADDED
@@ -0,0 +1,47 @@
+from dataclasses import dataclass
+from enum import Enum
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+
+# Select your tasks here
+# ---------------------------------------------------
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("Spelling", "exact_match", "spell")
+    task1 = Task("Inverse Spelling", "exact_match", "spell_inverse")
+    task2 = Task("Contains Char", "exact_match", "cont_char")
+    task3 = Task("contains_word", "exact_match", "cont_word")
+    task4 = Task("orth", "exact_match", "orth")
+    task5 = Task("sem", "exact_match", "sem")
+    task6 = Task("insert_char", "exact_match", "ins_char")
+    task7 = Task("insert_word", "exact_match", "ins_word")
+    task8 = Task("del_char", "exact_match", "del_char")
+    task9 = Task("del_word", "exact_match", "del_word")
+    task10 = Task("sub_char", "exact_match", "sub_char")
+    task11 = Task("sub_word", "exact_match", "sub_word")
+    task12 = Task("swap_char", "exact_match", "swap_char")
+    task13 = Task("swap_word", "exact_match", "swap_word")
+
+
+
+NUM_FEWSHOT = 0  # Change to match your few-shot setup
+# ---------------------------------------------------
+
+# Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">CUTE Leaderboard</h1>"""
+
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+This is the evaluation leaderboard for CUTE, an orthographic understanding benchmark.
+"""
+
+# Which evaluations are you running? How can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = """
+## How it works
+For more details, visit our repo: https://github.com/leukas/cute
+"""
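For orientation: each Tasks member wraps a Task dataclass, and downstream code (src/display/utils.py below) reads it by iterating the enum. A quick sketch of that access pattern:

from src.about import Tasks

for task in Tasks:
    t = task.value  # the Task dataclass instance behind each member
    print(task.name, t.benchmark, t.metric, t.col_name)
# task0 Spelling exact_match spell
# task1 Inverse Spelling exact_match spell_inverse
# ...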
src/display/__pycache__/css_html_js.cpython-310.pyc ADDED
Binary file (1.9 kB).
 
src/display/__pycache__/utils.cpython-310.pyc ADDED
Binary file (4.18 kB).
 
src/display/css_html_js.py ADDED
@@ -0,0 +1,105 @@
+custom_css = """
+
+.markdown-text {
+    font-size: 16px !important;
+}
+
+#models-to-add-text {
+    font-size: 18px !important;
+}
+
+#citation-button span {
+    font-size: 16px !important;
+}
+
+#citation-button textarea {
+    font-size: 16px !important;
+}
+
+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
+}
+
+#leaderboard-table {
+    margin-top: 15px;
+}
+
+#leaderboard-table-lite {
+    margin-top: 15px;
+}
+
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+
+#search-bar {
+    padding: 0px;
+}
+
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
+}
+
+#scale-logo .download {
+    display: none;
+}
+#filter_type {
+    border: 0;
+    padding-left: 0;
+    padding-top: 0;
+}
+#filter_type label {
+    display: flex;
+}
+#filter_type label > span {
+    margin-top: var(--spacing-lg);
+    margin-right: 0.5em;
+}
+#filter_type label > .wrap {
+    width: 103px;
+}
+#filter_type label > .wrap .wrap-inner {
+    padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input {
+    width: 1px;
+}
+#filter-columns-type {
+    border: 0;
+    padding: 0.5em;
+}
+#filter-columns-size {
+    border: 0;
+    padding: 0.5em;
+}
+#box-filter > .form {
+    border: 0;
+}
+"""
+
+get_window_url_params = """
+function(url_params) {
+    const params = new URLSearchParams(window.location.search);
+    url_params = Object.fromEntries(params);
+    return url_params;
+}
+"""
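Note: get_window_url_params is a JavaScript callback that turns the page's query string into a plain object, in the shape Gradio accepts for load-event hooks; it ships with this file but is not wired up anywhere in app.py above.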
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+    return model_hyperlink(link, model_name)
+
+
+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+
+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
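A quick check of the helpers above; the model name here is a placeholder:

from src.display.formatting import make_clickable_model

print(make_clickable_model("leukas/some-model"))
# <a target="_blank" href="https://huggingface.co/leukas/some-model" ...>leukas/some-model</a>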
src/display/utils.py ADDED
@@ -0,0 +1,110 @@
+from dataclasses import dataclass, make_dataclass
+from enum import Enum
+
+import pandas as pd
+
+from src.about import Tasks
+
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+
+# These classes hold the user-facing column names,
+# so a rename only has to happen here rather than
+# throughout the code.
+@dataclass
+class ColumnContent:
+    name: str
+    type: str
+    displayed_by_default: bool
+    hidden: bool = False
+    never_hidden: bool = False
+
+## Leaderboard columns
+auto_eval_column_dict = []
+# Init
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)])
+# Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
+# We use make_dataclass to dynamically fill in the score columns from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
+## For the queue columns in the submission tab
+@dataclass(frozen=True)
+class EvalQueueColumn:  # Queue column
+    model = ColumnContent("model", "markdown", True)
+    revision = ColumnContent("revision", "str", True)
+    private = ColumnContent("private", "bool", True)
+    precision = ColumnContent("precision", "str", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
+    status = ColumnContent("status", "str", True)
+
+## All the model information that we might need
+@dataclass
+class ModelDetails:
+    name: str
+    display_name: str = ""
+    symbol: str = ""  # emoji
+
+
+class ModelType(Enum):
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")
+
+    def to_str(self, separator=" "):
+        return f"{self.value.symbol}{separator}{self.value.name}"
+
+    @staticmethod
+    def from_str(type):
+        if "fine-tuned" in type or "🔶" in type:
+            return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
+            return ModelType.PT
+        if "RL-tuned" in type or "🟦" in type:
+            return ModelType.RL
+        if "instruction-tuned" in type or "⭕" in type:
+            return ModelType.IFT
+        return ModelType.Unknown
+
+class WeightType(Enum):
+    Adapter = ModelDetails("Adapter")
+    Original = ModelDetails("Original")
+    Delta = ModelDetails("Delta")
+
+class Precision(Enum):
+    float16 = ModelDetails("float16")
+    bfloat16 = ModelDetails("bfloat16")
+    Unknown = ModelDetails("?")
+
+    def from_str(precision):
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
+        if precision in ["torch.bfloat16", "bfloat16"]:
+            return Precision.bfloat16
+        return Precision.Unknown
+
+# Column selection
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+
+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+
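To illustrate what make_dataclass yields here: AutoEvalColumn is a dataclass whose attributes default to the ColumnContent instances registered above, so column metadata can be read straight off the class, exactly as app.py does:

from src.display.utils import AutoEvalColumn, fields

print(AutoEvalColumn.model.name)    # "model"
print(AutoEvalColumn.average.type)  # "number"

# The comprehension app.py uses to pick the default-visible columns
print([c.name for c in fields(AutoEvalColumn) if c.displayed_by_default])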
src/envs.py ADDED
@@ -0,0 +1,21 @@
+import os
+
+from huggingface_hub import HfApi
+
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+
+OWNER = "leukas"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+# ----------------------------------
+
+REPO_ID = f"{OWNER}/cute_leaderboard"
+RESULTS_REPO = f"{OWNER}/cute_results"
+
+# If you set up a cache later, just change HF_HOME
+CACHE_PATH = os.getenv("HF_HOME", ".")
+
+# Local caches
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")
+
+API = HfApi(token=TOKEN)
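To sanity-check this configuration locally, it is enough to set the token before importing the module; the token value below is a placeholder:

import os
os.environ.setdefault("HF_TOKEN", "hf_xxx")  # placeholder; use your own token

from src.envs import EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO
print(REPO_ID)            # leukas/cute_leaderboard
print(RESULTS_REPO)       # leukas/cute_results
print(EVAL_RESULTS_PATH)  # ./results unless HF_HOME is set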