albertvillanova (HF staff) committed
Commit d9f31f1
1 Parent(s): 07448fb

Create constants module

Files changed (2)
  1. app.py +3 -79
  2. constants.py +74 -0
app.py CHANGED
@@ -1,86 +1,10 @@
-import io
 import json
 
 import gradio as gr
 import pandas as pd
 from huggingface_hub import HfFileSystem
 
-
-RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
-EXCLUDED_KEYS = {
-    "pretty_env_info",
-    "chat_template",
-    "group_subtasks",
-}
-# EXCLUDED_RESULTS_KEYS = {
-#     "leaderboard",
-# }
-# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
-#     "alias",
-# }
-
-DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
-DETAILS_FILENAME = "samples_{subtask}_*.json"
-
-TASKS = {
-    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
-    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
-    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
-    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
-    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
-    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
-    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
-}
-SUBTASKS = {
-    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
-    "leaderboard_bbh": [
-        "leaderboard_bbh_boolean_expressions",
-        "leaderboard_bbh_causal_judgement",
-        "leaderboard_bbh_date_understanding",
-        "leaderboard_bbh_disambiguation_qa",
-        "leaderboard_bbh_formal_fallacies",
-        "leaderboard_bbh_geometric_shapes",
-        "leaderboard_bbh_hyperbaton",
-        "leaderboard_bbh_logical_deduction_five_objects",
-        "leaderboard_bbh_logical_deduction_seven_objects",
-        "leaderboard_bbh_logical_deduction_three_objects",
-        "leaderboard_bbh_movie_recommendation",
-        "leaderboard_bbh_navigate",
-        "leaderboard_bbh_object_counting",
-        "leaderboard_bbh_penguins_in_a_table",
-        "leaderboard_bbh_reasoning_about_colored_objects",
-        "leaderboard_bbh_ruin_names",
-        "leaderboard_bbh_salient_translation_error_detection",
-        "leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding",
-        "leaderboard_bbh_temporal_sequences",
-        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
-        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
-        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
-        "leaderboard_bbh_web_of_lies",
-    ],
-    "leaderboard_gpqa": [
-        "leaderboard_gpqa_extended",
-        "leaderboard_gpqa_diamond",
-        "leaderboard_gpqa_main",
-    ],
-    "leaderboard_ifeval": ["leaderboard_ifeval"],
-    # "leaderboard_math_hard": [
-    "leaderboard_math": [
-        "leaderboard_math_algebra_hard",
-        "leaderboard_math_counting_and_prob_hard",
-        "leaderboard_math_geometry_hard",
-        "leaderboard_math_intermediate_algebra_hard",
-        "leaderboard_math_num_theory_hard",
-        "leaderboard_math_prealgebra_hard",
-        "leaderboard_math_precalculus_hard",
-    ],
-    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
-    "leaderboard_musr": [
-        "leaderboard_musr_murder_mysteries",
-        "leaderboard_musr_object_placements",
-        "leaderboard_musr_team_allocation",
-    ],
-}
+from constants import DETAILS_DATASET_ID, DETAILS_FILENAME, RESULTS_DATASET_ID, SUBTASKS, TASKS
 
 
 fs = HfFileSystem()
@@ -96,7 +20,7 @@ def filter_latest_result_path_per_model(paths):
 
     d = defaultdict(list)
     for path in paths:
-        model_id, _ = path[len(RESULTS_DATASET_ID) +1:].rsplit("/", 1)
+        model_id, _ = path[len(RESULTS_DATASET_ID) + 1:].rsplit("/", 1)
         d[model_id].append(path)
     return {model_id: max(paths) for model_id, paths in d.items()}
 
@@ -121,7 +45,7 @@ def load_results_dataframe(model_id):
     result_path = get_result_path_from_model(model_id, latest_result_path_per_model)
     data = load_data(result_path)
     model_name = data.get("model_name", "Model")
-    df = pd.json_normalize([{key: value for key, value in data.items() if key not in EXCLUDED_KEYS}])
+    df = pd.json_normalize([{key: value for key, value in data.items()}])
     # df.columns = df.columns.str.split(".") # .split return a list instead of a tuple
     return df.set_index(pd.Index([model_name])).reset_index()
 
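Two of the hunks above deserve a note. The load_results_dataframe change is behavioral: the EXCLUDED_KEYS filter is dropped (the set is commented out in the new constants.py), so every top-level key of the result JSON is now normalized into a column. The filter_latest_result_path_per_model change, by contrast, is purely cosmetic ("+1" becomes "+ 1"): the slice strips the RESULTS_DATASET_ID prefix plus the separating slash, and rsplit peels off the timestamped filename, leaving the model id as the grouping key. A minimal standalone sketch of that grouping logic, using hypothetical example paths (the real app lists them via HfFileSystem):

from collections import defaultdict

RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"

# Hypothetical result paths, for illustration only.
paths = [
    "datasets/open-llm-leaderboard/results/org/model/results_2024-06-01T00-00-00.json",
    "datasets/open-llm-leaderboard/results/org/model/results_2024-06-15T00-00-00.json",
]

d = defaultdict(list)
for path in paths:
    # "org/model/results_....json" -> ("org/model", "results_....json")
    model_id, _ = path[len(RESULTS_DATASET_ID) + 1:].rsplit("/", 1)
    d[model_id].append(path)

# ISO-like timestamps sort lexicographically, so max() picks the latest file per model.
latest = {model_id: max(ps) for model_id, ps in d.items()}
assert latest["org/model"].endswith("results_2024-06-15T00-00-00.json")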
constants.py ADDED
@@ -0,0 +1,74 @@
+RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
+# EXCLUDED_KEYS = {
+#     "pretty_env_info",
+#     "chat_template",
+#     "group_subtasks",
+# }
+# EXCLUDED_RESULTS_KEYS = {
+#     "leaderboard",
+# }
+# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
+#     "alias",
+# }
+
+DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
+DETAILS_FILENAME = "samples_{subtask}_*.json"
+TASKS = {
+    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
+    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
+    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
+    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
+    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
+    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
+    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
+}
+SUBTASKS = {
+    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
+    "leaderboard_bbh": [
+        "leaderboard_bbh_boolean_expressions",
+        "leaderboard_bbh_causal_judgement",
+        "leaderboard_bbh_date_understanding",
+        "leaderboard_bbh_disambiguation_qa",
+        "leaderboard_bbh_formal_fallacies",
+        "leaderboard_bbh_geometric_shapes",
+        "leaderboard_bbh_hyperbaton",
+        "leaderboard_bbh_logical_deduction_five_objects",
+        "leaderboard_bbh_logical_deduction_seven_objects",
+        "leaderboard_bbh_logical_deduction_three_objects",
+        "leaderboard_bbh_movie_recommendation",
+        "leaderboard_bbh_navigate",
+        "leaderboard_bbh_object_counting",
+        "leaderboard_bbh_penguins_in_a_table",
+        "leaderboard_bbh_reasoning_about_colored_objects",
+        "leaderboard_bbh_ruin_names",
+        "leaderboard_bbh_salient_translation_error_detection",
+        "leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding",
+        "leaderboard_bbh_temporal_sequences",
+        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
+        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
+        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
+        "leaderboard_bbh_web_of_lies",
+    ],
+    "leaderboard_gpqa": [
+        "leaderboard_gpqa_extended",
+        "leaderboard_gpqa_diamond",
+        "leaderboard_gpqa_main",
+    ],
+    "leaderboard_ifeval": ["leaderboard_ifeval"],
+    # "leaderboard_math_hard": [
+    "leaderboard_math": [
+        "leaderboard_math_algebra_hard",
+        "leaderboard_math_counting_and_prob_hard",
+        "leaderboard_math_geometry_hard",
+        "leaderboard_math_intermediate_algebra_hard",
+        "leaderboard_math_num_theory_hard",
+        "leaderboard_math_prealgebra_hard",
+        "leaderboard_math_precalculus_hard",
+    ],
+    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
+    "leaderboard_musr": [
+        "leaderboard_musr_murder_mysteries",
+        "leaderboard_musr_object_placements",
+        "leaderboard_musr_team_allocation",
+    ],
+}
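DETAILS_DATASET_ID and DETAILS_FILENAME are str.format templates: the first names a model's per-model details dataset, the second globs the sample files for one subtask inside it. A rough usage sketch, with a hypothetical model name and subtask (how the real app resolves the pattern through HfFileSystem is an assumption here):

from huggingface_hub import HfFileSystem

DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
DETAILS_FILENAME = "samples_{subtask}_*.json"

fs = HfFileSystem()

# Hypothetical values, for illustration only.
details_repo = DETAILS_DATASET_ID.format(model_name_sanitized="org__model")
filename_glob = DETAILS_FILENAME.format(subtask="leaderboard_ifeval")

# Expand the glob against the Hub-backed filesystem.
sample_files = fs.glob(f"{details_repo}/**/{filename_glob}")

TASKS pairs each task key with a display label and a results key, while SUBTASKS lists the subtask names used to fill the {subtask} slot of DETAILS_FILENAME.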