Spaces:
Paused
Paused
Clémentine
committed on
Commit
·
314f91a
1
Parent(s):
1257fc3
fixes
Browse files
- src/display/about.py +2 -2
- src/display/utils.py +1 -0
- src/leaderboard/filter_models.py +0 -50
- src/populate.py +1 -4
- src/submission/submit.py +2 -2
src/display/about.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
-
from src.display.utils import ModelType
|
2 |
-
from enum import Enum
|
3 |
from dataclasses import dataclass
|
|
|
4 |
|
5 |
@dataclass
|
6 |
class Task:
|
@@ -8,6 +7,7 @@ class Task:
|
|
8 |
metric: str
|
9 |
col_name: str
|
10 |
|
|
|
11 |
# Init: to update with your specific keys
|
12 |
class Tasks(Enum):
|
13 |
task0 = Task("Key in the harness", "metric in the harness", "Display name 1")
|
|
|
|
|
|
|
1 |
from dataclasses import dataclass
|
2 |
+
from enum import Enum
|
3 |
|
4 |
@dataclass
|
5 |
class Task:
|
|
|
7 |
metric: str
|
8 |
col_name: str
|
9 |
|
10 |
+
|
11 |
# Init: to update with your specific keys
|
12 |
class Tasks(Enum):
|
13 |
task0 = Task("Key in the harness", "metric in the harness", "Display name 1")
|
src/display/utils.py
CHANGED
@@ -8,6 +8,7 @@ from src.display.about import Tasks
|
|
8 |
def fields(raw_class):
|
9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
10 |
|
|
|
11 |
# These classes are for user facing column names,
|
12 |
# to avoid having to change them all around the code
|
13 |
# when a modif is needed
|
|
|
8 |
def fields(raw_class):
|
9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
10 |
|
11 |
+
|
12 |
# These classes are for user facing column names,
|
13 |
# to avoid having to change them all around the code
|
14 |
# when a modif is needed
|
src/leaderboard/filter_models.py
DELETED
@@ -1,50 +0,0 @@
|
|
1 |
-
from src.display.formatting import model_hyperlink
|
2 |
-
from src.display.utils import AutoEvalColumn
|
3 |
-
|
4 |
-
# Models which have been flagged by users as being problematic for a reason or another
|
5 |
-
# (Model name to forum discussion link)
|
6 |
-
FLAGGED_MODELS = {
|
7 |
-
"Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
|
8 |
-
"deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
|
9 |
-
"Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
|
10 |
-
"Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
|
11 |
-
"TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
|
12 |
-
"gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
|
13 |
-
"AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
|
14 |
-
"AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
|
15 |
-
"AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
|
16 |
-
}
|
17 |
-
|
18 |
-
# Models which have been requested by orgs to not be submitted on the leaderboard
|
19 |
-
DO_NOT_SUBMIT_MODELS = [
|
20 |
-
"Voicelab/trurl-2-13b", # trained on MMLU
|
21 |
-
]
|
22 |
-
|
23 |
-
|
24 |
-
def flag_models(leaderboard_data: list[dict]):
|
25 |
-
for model_data in leaderboard_data:
|
26 |
-
if model_data["model_name_for_query"] in FLAGGED_MODELS:
|
27 |
-
issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
|
28 |
-
issue_link = model_hyperlink(
|
29 |
-
FLAGGED_MODELS[model_data["model_name_for_query"]],
|
30 |
-
f"See discussion #{issue_num}",
|
31 |
-
)
|
32 |
-
model_data[
|
33 |
-
AutoEvalColumn.model.name
|
34 |
-
] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
|
35 |
-
|
36 |
-
|
37 |
-
def remove_forbidden_models(leaderboard_data: list[dict]):
|
38 |
-
indices_to_remove = []
|
39 |
-
for ix, model in enumerate(leaderboard_data):
|
40 |
-
if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
|
41 |
-
indices_to_remove.append(ix)
|
42 |
-
|
43 |
-
for ix in reversed(indices_to_remove):
|
44 |
-
leaderboard_data.pop(ix)
|
45 |
-
return leaderboard_data
|
46 |
-
|
47 |
-
|
48 |
-
def filter_models(leaderboard_data: list[dict]):
|
49 |
-
leaderboard_data = remove_forbidden_models(leaderboard_data)
|
50 |
-
flag_models(leaderboard_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/populate.py
CHANGED
@@ -4,16 +4,13 @@ import os
|
|
4 |
import pandas as pd
|
5 |
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
-
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
8 |
-
from src.leaderboard.filter_models import filter_models
|
9 |
from src.leaderboard.read_evals import get_raw_eval_results
|
10 |
|
11 |
|
12 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
13 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
all_data_json = [v.to_dict() for v in raw_data]
|
15 |
-
all_data_json.append(baseline_row)
|
16 |
-
filter_models(all_data_json)
|
17 |
|
18 |
df = pd.DataFrame.from_records(all_data_json)
|
19 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
|
|
4 |
import pandas as pd
|
5 |
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
+
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
|
|
8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
9 |
|
10 |
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
13 |
all_data_json = [v.to_dict() for v in raw_data]
|
|
|
|
|
14 |
|
15 |
df = pd.DataFrame.from_records(all_data_json)
|
16 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
src/submission/submit.py
CHANGED
@@ -3,7 +3,7 @@ import os
|
|
3 |
from datetime import datetime, timezone
|
4 |
|
5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
-
from src.envs import API, EVAL_REQUESTS_PATH,
|
7 |
from src.submission.check_validity import (
|
8 |
already_submitted_models,
|
9 |
check_model_card,
|
@@ -45,7 +45,7 @@ def add_new_eval(
|
|
45 |
|
46 |
# Is the model on the hub?
|
47 |
if weight_type in ["Delta", "Adapter"]:
|
48 |
-
base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=
|
49 |
if not base_model_on_hub:
|
50 |
return styled_error(f'Base model "{base_model}" {error}')
|
51 |
|
|
|
3 |
from datetime import datetime, timezone
|
4 |
|
5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
+
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
7 |
from src.submission.check_validity import (
|
8 |
already_submitted_models,
|
9 |
check_model_card,
|
|
|
45 |
|
46 |
# Is the model on the hub?
|
47 |
if weight_type in ["Delta", "Adapter"]:
|
48 |
+
base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
|
49 |
if not base_model_on_hub:
|
50 |
return styled_error(f'Base model "{base_model}" {error}')
|
51 |
|