Commit d9f31f1
Parent(s): 07448fb

Create constants module

Changed files:
- app.py: +3 -79
- constants.py: +74 -0
app.py
CHANGED
@@ -1,86 +1,10 @@
-import io
 import json
 
 import gradio as gr
 import pandas as pd
 from huggingface_hub import HfFileSystem
 
-
-RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
-EXCLUDED_KEYS = {
-    "pretty_env_info",
-    "chat_template",
-    "group_subtasks",
-}
-# EXCLUDED_RESULTS_KEYS = {
-#     "leaderboard",
-# }
-# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
-#     "alias",
-# }
-
-DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
-DETAILS_FILENAME = "samples_{subtask}_*.json"
-
-TASKS = {
-    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
-    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
-    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
-    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
-    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
-    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
-    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
-}
-SUBTASKS = {
-    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
-    "leaderboard_bbh": [
-        "leaderboard_bbh_boolean_expressions",
-        "leaderboard_bbh_causal_judgement",
-        "leaderboard_bbh_date_understanding",
-        "leaderboard_bbh_disambiguation_qa",
-        "leaderboard_bbh_formal_fallacies",
-        "leaderboard_bbh_geometric_shapes",
-        "leaderboard_bbh_hyperbaton",
-        "leaderboard_bbh_logical_deduction_five_objects",
-        "leaderboard_bbh_logical_deduction_seven_objects",
-        "leaderboard_bbh_logical_deduction_three_objects",
-        "leaderboard_bbh_movie_recommendation",
-        "leaderboard_bbh_navigate",
-        "leaderboard_bbh_object_counting",
-        "leaderboard_bbh_penguins_in_a_table",
-        "leaderboard_bbh_reasoning_about_colored_objects",
-        "leaderboard_bbh_ruin_names",
-        "leaderboard_bbh_salient_translation_error_detection",
-        "leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding",
-        "leaderboard_bbh_temporal_sequences",
-        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
-        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
-        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
-        "leaderboard_bbh_web_of_lies",
-    ],
-    "leaderboard_gpqa": [
-        "leaderboard_gpqa_extended",
-        "leaderboard_gpqa_diamond",
-        "leaderboard_gpqa_main",
-    ],
-    "leaderboard_ifeval": ["leaderboard_ifeval"],
-    # "leaderboard_math_hard": [
-    "leaderboard_math": [
-        "leaderboard_math_algebra_hard",
-        "leaderboard_math_counting_and_prob_hard",
-        "leaderboard_math_geometry_hard",
-        "leaderboard_math_intermediate_algebra_hard",
-        "leaderboard_math_num_theory_hard",
-        "leaderboard_math_prealgebra_hard",
-        "leaderboard_math_precalculus_hard",
-    ],
-    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
-    "leaderboard_musr": [
-        "leaderboard_musr_murder_mysteries",
-        "leaderboard_musr_object_placements",
-        "leaderboard_musr_team_allocation",
-    ],
-}
+from constants import DETAILS_DATASET_ID, DETAILS_FILENAME, RESULTS_DATASET_ID, SUBTASKS, TASKS
 
 
 fs = HfFileSystem()
@@ -96,7 +20,7 @@ def filter_latest_result_path_per_model(paths):
 
     d = defaultdict(list)
     for path in paths:
-        model_id, _ = path[len(RESULTS_DATASET_ID) +1:].rsplit("/", 1)
+        model_id, _ = path[len(RESULTS_DATASET_ID) + 1:].rsplit("/", 1)
         d[model_id].append(path)
     return {model_id: max(paths) for model_id, paths in d.items()}
 
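This hunk only adjusts spacing around `+ 1`, but the line it touches does the per-model grouping: the slice drops the `RESULTS_DATASET_ID` prefix plus its trailing slash, and `rsplit("/", 1)` peels the filename off the remaining `org/model` path. A minimal sketch of that logic, with hypothetical paths and filenames (not taken from this commit):

from collections import defaultdict

RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"

# Hypothetical paths shaped like "<dataset>/<org>/<model>/<results file>".
paths = [
    "datasets/open-llm-leaderboard/results/org/model-a/results_2024-06-01.json",
    "datasets/open-llm-leaderboard/results/org/model-a/results_2024-06-15.json",
    "datasets/open-llm-leaderboard/results/org/model-b/results_2024-05-30.json",
]

d = defaultdict(list)
for path in paths:
    # "org/model-a/results_....json" -> ("org/model-a", "results_....json")
    model_id, _ = path[len(RESULTS_DATASET_ID) + 1:].rsplit("/", 1)
    d[model_id].append(path)

# max() keeps the lexicographically greatest path per model, which is the
# newest one whenever filenames embed a sortable timestamp.
latest = {model_id: max(ps) for model_id, ps in d.items()}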
@@ -121,7 +45,7 @@ def load_results_dataframe(model_id):
     result_path = get_result_path_from_model(model_id, latest_result_path_per_model)
     data = load_data(result_path)
     model_name = data.get("model_name", "Model")
-    df = pd.json_normalize([{key: value for key, value in data.items() if key not in EXCLUDED_KEYS}])
+    df = pd.json_normalize([{key: value for key, value in data.items()}])
     # df.columns = df.columns.str.split(".")  # .split return a list instead of a tuple
     return df.set_index(pd.Index([model_name])).reset_index()
 
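Assuming the removed line filtered keys through `EXCLUDED_KEYS` (reconstructed above from the deleted constant; the original line was truncated in this view), the comprehension on the `+` line now copies `data` verbatim, and `pd.json_normalize` flattens any nested dicts into dot-separated column names. A small sketch with a hypothetical payload:

import pandas as pd

# Hypothetical results payload; the real files hold evaluation harness output.
data = {
    "model_name": "org/model-a",
    "results": {"leaderboard_ifeval": {"acc,none": 0.55}},
}

df = pd.json_normalize([{key: value for key, value in data.items()}])
print(df.columns.tolist())
# ['model_name', 'results.leaderboard_ifeval.acc,none'] -- nested keys become
# dot-separated column names, which is why the disabled str.split(".") line
# above would produce list column labels rather than tuples.
df = df.set_index(pd.Index([data["model_name"]])).reset_index()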
constants.py
ADDED
@@ -0,0 +1,74 @@
+RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
+# EXCLUDED_KEYS = {
+#     "pretty_env_info",
+#     "chat_template",
+#     "group_subtasks",
+# }
+# EXCLUDED_RESULTS_KEYS = {
+#     "leaderboard",
+# }
+# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
+#     "alias",
+# }
+
+DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
+DETAILS_FILENAME = "samples_{subtask}_*.json"
+TASKS = {
+    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
+    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
+    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
+    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
+    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
+    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
+    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
+}
+SUBTASKS = {
+    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
+    "leaderboard_bbh": [
+        "leaderboard_bbh_boolean_expressions",
+        "leaderboard_bbh_causal_judgement",
+        "leaderboard_bbh_date_understanding",
+        "leaderboard_bbh_disambiguation_qa",
+        "leaderboard_bbh_formal_fallacies",
+        "leaderboard_bbh_geometric_shapes",
+        "leaderboard_bbh_hyperbaton",
+        "leaderboard_bbh_logical_deduction_five_objects",
+        "leaderboard_bbh_logical_deduction_seven_objects",
+        "leaderboard_bbh_logical_deduction_three_objects",
+        "leaderboard_bbh_movie_recommendation",
+        "leaderboard_bbh_navigate",
+        "leaderboard_bbh_object_counting",
+        "leaderboard_bbh_penguins_in_a_table",
+        "leaderboard_bbh_reasoning_about_colored_objects",
+        "leaderboard_bbh_ruin_names",
+        "leaderboard_bbh_salient_translation_error_detection",
+        "leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding",
+        "leaderboard_bbh_temporal_sequences",
+        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
+        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
+        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
+        "leaderboard_bbh_web_of_lies",
+    ],
+    "leaderboard_gpqa": [
+        "leaderboard_gpqa_extended",
+        "leaderboard_gpqa_diamond",
+        "leaderboard_gpqa_main",
+    ],
+    "leaderboard_ifeval": ["leaderboard_ifeval"],
+    # "leaderboard_math_hard": [
+    "leaderboard_math": [
+        "leaderboard_math_algebra_hard",
+        "leaderboard_math_counting_and_prob_hard",
+        "leaderboard_math_geometry_hard",
+        "leaderboard_math_intermediate_algebra_hard",
+        "leaderboard_math_num_theory_hard",
+        "leaderboard_math_prealgebra_hard",
+        "leaderboard_math_precalculus_hard",
+    ],
+    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
+    "leaderboard_musr": [
+        "leaderboard_musr_murder_mysteries",
+        "leaderboard_musr_object_placements",
+        "leaderboard_musr_team_allocation",
+    ],
+}
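A short sketch of how these constants compose downstream: `TASKS` maps a task key to a display name plus the key used to index `SUBTASKS`, and the two `DETAILS_*` templates combine into a glob over a per-model details dataset. The model name and the `**` path segment below are illustrative assumptions, not taken from this commit:

from huggingface_hub import HfFileSystem

from constants import DETAILS_DATASET_ID, DETAILS_FILENAME, SUBTASKS, TASKS

fs = HfFileSystem()

task = "leaderboard_gpqa"
pretty_name, subtasks_key = TASKS[task]  # ("GPQA", "leaderboard_gpqa")
for subtask in SUBTASKS[subtasks_key]:
    # e.g. "datasets/open-llm-leaderboard/org__model-details/**/samples_leaderboard_gpqa_main_*.json"
    pattern = "/".join([
        DETAILS_DATASET_ID.format(model_name_sanitized="org__model"),  # hypothetical model name
        "**",  # assumed repo layout between the dataset root and the samples files
        DETAILS_FILENAME.format(subtask=subtask),
    ])
    detail_files = fs.glob(pattern)  # the "*" in the filename matches the timestamp suffix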