leukas committed
Commit f7ac2a9 · 1 Parent(s): 80cbf2f

initial commit
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Cute Leaderboard
-emoji: 🔥
+emoji: 🎀
 colorFrom: blue
 colorTo: blue
 sdk: gradio
@@ -9,4 +9,4 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
app.py ADDED
@@ -0,0 +1,85 @@
+import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+
+from src.about import (
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    AutoEvalColumn,
+    fields,
+)
+from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
+
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+
+### Space initialisation
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
+
+LEADERBOARD_DF = pd.read_csv(f"{EVAL_RESULTS_PATH}/results.csv")
+# Scale every score column (all except the first, "model") from fractions to percentages
+for col in LEADERBOARD_DF.columns[1:]:
+    LEADERBOARD_DF[col] = LEADERBOARD_DF[col] * 100
+
+# Add an average column over the numeric score columns
+LEADERBOARD_DF["Average"] = LEADERBOARD_DF.mean(axis=1, numeric_only=True)
+
+# Move the average column to the front, right after the model name
+LEADERBOARD_DF = LEADERBOARD_DF[["model", "Average"] + [col for col in LEADERBOARD_DF.columns if col not in ["model", "Average"]]]
+
+# Round scores to one decimal place
+LEADERBOARD_DF = LEADERBOARD_DF.round(1)
+
+# Sort rows by average score, best first
+LEADERBOARD_DF = LEADERBOARD_DF.sort_values(by="Average", ascending=False)
+
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
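To make the score pipeline in app.py concrete, here is a minimal sketch run on an invented stand-in for results.csv; the model names and task columns below are placeholders, while the real file comes from the RESULTS_REPO dataset:

import pandas as pd

# Invented stand-in for results.csv (all names are placeholders)
df = pd.DataFrame({
    "model": ["org/model-a", "org/model-b"],
    "spell": [0.90, 0.70],
    "cont_char": [0.70, 0.50],
})

for col in df.columns[1:]:  # fractions -> percentages
    df[col] = df[col] * 100
df["Average"] = df.mean(axis=1, numeric_only=True)
df = df[["model", "Average"] + [c for c in df.columns if c not in ["model", "Average"]]]
df = df.round(1).sort_values(by="Average", ascending=False)
print(df)
#          model  Average  spell  cont_char
# 0  org/model-a     80.0   90.0       70.0
# 1  org/model-b     60.0   70.0       50.0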
src/__pycache__/about.cpython-310.pyc ADDED
Binary file (1.45 kB).
 
src/__pycache__/envs.cpython-310.pyc ADDED
Binary file (480 Bytes).
 
src/about.py ADDED
@@ -0,0 +1,47 @@
+from dataclasses import dataclass
+from enum import Enum
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+
+# Select your tasks here
+# ---------------------------------------------------
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("Spelling", "exact_match", "spell")
+    task1 = Task("Inverse Spelling", "exact_match", "spell_inverse")
+    task2 = Task("Contains Char", "exact_match", "cont_char")
+    task3 = Task("contains_word", "exact_match", "cont_word")
+    task4 = Task("orth", "exact_match", "orth")
+    task5 = Task("sem", "exact_match", "sem")
+    task6 = Task("insert_char", "exact_match", "ins_char")
+    task7 = Task("insert_word", "exact_match", "ins_word")
+    task8 = Task("del_char", "exact_match", "del_char")
+    task9 = Task("del_word", "exact_match", "del_word")
+    task10 = Task("sub_char", "exact_match", "sub_char")
+    task11 = Task("sub_word", "exact_match", "sub_word")
+    task12 = Task("swap_char", "exact_match", "swap_char")
+    task13 = Task("swap_word", "exact_match", "swap_word")
+
+
+
+NUM_FEWSHOT = 0  # Change to match your few-shot setup
+# ---------------------------------------------------
+
+# Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">CUTE Leaderboard</h1>"""
+
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+This is the evaluation leaderboard for CUTE, an orthographic understanding benchmark.
+"""
+
+# Which evaluations are you running? How can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = """
+## How it works
+For more details, visit our repo: https://github.com/leukas/cute
+"""
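For orientation: each Tasks member wraps a Task dataclass, and downstream code (src/display/utils.py below) reads it by iterating the enum. A quick sketch of that access pattern:

from src.about import Tasks

for task in Tasks:
    t = task.value  # the Task dataclass instance behind each member
    print(task.name, t.benchmark, t.metric, t.col_name)
# task0 Spelling exact_match spell
# task1 Inverse Spelling exact_match spell_inverse
# ...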
src/display/__pycache__/css_html_js.cpython-310.pyc ADDED
Binary file (1.9 kB).
 
src/display/__pycache__/utils.cpython-310.pyc ADDED
Binary file (4.18 kB).
 
src/display/css_html_js.py ADDED
@@ -0,0 +1,105 @@
+custom_css = """
+
+.markdown-text {
+    font-size: 16px !important;
+}
+
+#models-to-add-text {
+    font-size: 18px !important;
+}
+
+#citation-button span {
+    font-size: 16px !important;
+}
+
+#citation-button textarea {
+    font-size: 16px !important;
+}
+
+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
+}
+
+#leaderboard-table {
+    margin-top: 15px;
+}
+
+#leaderboard-table-lite {
+    margin-top: 15px;
+}
+
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+
+#search-bar {
+    padding: 0px;
+}
+
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
+}
+
+#scale-logo .download {
+    display: none;
+}
+#filter_type {
+    border: 0;
+    padding-left: 0;
+    padding-top: 0;
+}
+#filter_type label {
+    display: flex;
+}
+#filter_type label > span {
+    margin-top: var(--spacing-lg);
+    margin-right: 0.5em;
+}
+#filter_type label > .wrap {
+    width: 103px;
+}
+#filter_type label > .wrap .wrap-inner {
+    padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input {
+    width: 1px;
+}
+#filter-columns-type {
+    border: 0;
+    padding: 0.5em;
+}
+#filter-columns-size {
+    border: 0;
+    padding: 0.5em;
+}
+#box-filter > .form {
+    border: 0;
+}
+"""
+
+get_window_url_params = """
+function(url_params) {
+    const params = new URLSearchParams(window.location.search);
+    url_params = Object.fromEntries(params);
+    return url_params;
+}
+"""
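Note: get_window_url_params is a JavaScript callback that turns the page's query string into a plain object, in the shape Gradio accepts for load-event hooks; it ships with this file but is not wired up anywhere in app.py above.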
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+    return model_hyperlink(link, model_name)
+
+
+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+
+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
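A quick check of the helpers above; the model name here is a placeholder:

from src.display.formatting import make_clickable_model

print(make_clickable_model("leukas/some-model"))
# <a target="_blank" href="https://huggingface.co/leukas/some-model" ...>leukas/some-model</a>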
src/display/utils.py ADDED
@@ -0,0 +1,110 @@
+from dataclasses import dataclass, make_dataclass
+from enum import Enum
+
+import pandas as pd
+
+from src.about import Tasks
+
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+
+# These classes hold the user-facing column names,
+# so a rename only has to happen here rather than
+# throughout the code.
+@dataclass
+class ColumnContent:
+    name: str
+    type: str
+    displayed_by_default: bool
+    hidden: bool = False
+    never_hidden: bool = False
+
+## Leaderboard columns
+auto_eval_column_dict = []
+# Init
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)])
+# Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
+# We use make_dataclass to dynamically fill in the score columns from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
+## For the queue columns in the submission tab
+@dataclass(frozen=True)
+class EvalQueueColumn:  # Queue column
+    model = ColumnContent("model", "markdown", True)
+    revision = ColumnContent("revision", "str", True)
+    private = ColumnContent("private", "bool", True)
+    precision = ColumnContent("precision", "str", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
+    status = ColumnContent("status", "str", True)
+
+## All the model information that we might need
+@dataclass
+class ModelDetails:
+    name: str
+    display_name: str = ""
+    symbol: str = ""  # emoji
+
+
+class ModelType(Enum):
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")
+
+    def to_str(self, separator=" "):
+        return f"{self.value.symbol}{separator}{self.value.name}"
+
+    @staticmethod
+    def from_str(type):
+        if "fine-tuned" in type or "🔶" in type:
+            return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
+            return ModelType.PT
+        if "RL-tuned" in type or "🟦" in type:
+            return ModelType.RL
+        if "instruction-tuned" in type or "⭕" in type:
+            return ModelType.IFT
+        return ModelType.Unknown
+
+class WeightType(Enum):
+    Adapter = ModelDetails("Adapter")
+    Original = ModelDetails("Original")
+    Delta = ModelDetails("Delta")
+
+class Precision(Enum):
+    float16 = ModelDetails("float16")
+    bfloat16 = ModelDetails("bfloat16")
+    Unknown = ModelDetails("?")
+
+    def from_str(precision):
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
+        if precision in ["torch.bfloat16", "bfloat16"]:
+            return Precision.bfloat16
+        return Precision.Unknown
+
+# Column selection
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+
+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+
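To illustrate what make_dataclass yields here: AutoEvalColumn is a dataclass whose attributes default to the ColumnContent instances registered above, so column metadata can be read straight off the class, exactly as app.py does:

from src.display.utils import AutoEvalColumn, fields

print(AutoEvalColumn.model.name)    # "model"
print(AutoEvalColumn.average.type)  # "number"

# The comprehension app.py uses to pick the default-visible columns
print([c.name for c in fields(AutoEvalColumn) if c.displayed_by_default])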
src/envs.py ADDED
@@ -0,0 +1,21 @@
+import os
+
+from huggingface_hub import HfApi
+
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+
+OWNER = "leukas"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+# ----------------------------------
+
+REPO_ID = f"{OWNER}/cute_leaderboard"
+RESULTS_REPO = f"{OWNER}/cute_results"
+
+# If you set up a cache later, just change HF_HOME
+CACHE_PATH = os.getenv("HF_HOME", ".")
+
+# Local caches
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")
+
+API = HfApi(token=TOKEN)
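To sanity-check this configuration locally, it is enough to set the token before importing the module; the token value below is a placeholder:

import os
os.environ.setdefault("HF_TOKEN", "hf_xxx")  # placeholder; use your own token

from src.envs import EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO
print(REPO_ID)            # leukas/cute_leaderboard
print(RESULTS_REPO)       # leukas/cute_results
print(EVAL_RESULTS_PATH)  # ./results unless HF_HOME is set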