Commit d9f31f1
Parent(s): 07448fb

Create constants module

Changed files:
- app.py: +3 -79
- constants.py: +74 -0
app.py
CHANGED
@@ -1,86 +1,10 @@
-import io
 import json
 
 import gradio as gr
 import pandas as pd
 from huggingface_hub import HfFileSystem
 
-
-RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
-EXCLUDED_KEYS = {
-    "pretty_env_info",
-    "chat_template",
-    "group_subtasks",
-}
-# EXCLUDED_RESULTS_KEYS = {
-#     "leaderboard",
-# }
-# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
-#     "alias",
-# }
-
-DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
-DETAILS_FILENAME = "samples_{subtask}_*.json"
-
-TASKS = {
-    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
-    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
-    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
-    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
-    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
-    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
-    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
-}
-SUBTASKS = {
-    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
-    "leaderboard_bbh": [
-        "leaderboard_bbh_boolean_expressions",
-        "leaderboard_bbh_causal_judgement",
-        "leaderboard_bbh_date_understanding",
-        "leaderboard_bbh_disambiguation_qa",
-        "leaderboard_bbh_formal_fallacies",
-        "leaderboard_bbh_geometric_shapes",
-        "leaderboard_bbh_hyperbaton",
-        "leaderboard_bbh_logical_deduction_five_objects",
-        "leaderboard_bbh_logical_deduction_seven_objects",
-        "leaderboard_bbh_logical_deduction_three_objects",
-        "leaderboard_bbh_movie_recommendation",
-        "leaderboard_bbh_navigate",
-        "leaderboard_bbh_object_counting",
-        "leaderboard_bbh_penguins_in_a_table",
-        "leaderboard_bbh_reasoning_about_colored_objects",
-        "leaderboard_bbh_ruin_names",
-        "leaderboard_bbh_salient_translation_error_detection",
-        "leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding",
-        "leaderboard_bbh_temporal_sequences",
-        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
-        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
-        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
-        "leaderboard_bbh_web_of_lies",
-    ],
-    "leaderboard_gpqa": [
-        "leaderboard_gpqa_extended",
-        "leaderboard_gpqa_diamond",
-        "leaderboard_gpqa_main",
-    ],
-    "leaderboard_ifeval": ["leaderboard_ifeval"],
-    # "leaderboard_math_hard": [
-    "leaderboard_math": [
-        "leaderboard_math_algebra_hard",
-        "leaderboard_math_counting_and_prob_hard",
-        "leaderboard_math_geometry_hard",
-        "leaderboard_math_intermediate_algebra_hard",
-        "leaderboard_math_num_theory_hard",
-        "leaderboard_math_prealgebra_hard",
-        "leaderboard_math_precalculus_hard",
-    ],
-    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
-    "leaderboard_musr": [
-        "leaderboard_musr_murder_mysteries",
-        "leaderboard_musr_object_placements",
-        "leaderboard_musr_team_allocation",
-    ],
-}
+from constants import DETAILS_DATASET_ID, DETAILS_FILENAME, RESULTS_DATASET_ID, SUBTASKS, TASKS
 
 
 fs = HfFileSystem()
@@ -96,7 +20,7 @@ def filter_latest_result_path_per_model(paths):
 
     d = defaultdict(list)
     for path in paths:
-        model_id, _ = path[len(RESULTS_DATASET_ID) +1:].rsplit("/", 1)
+        model_id, _ = path[len(RESULTS_DATASET_ID) + 1:].rsplit("/", 1)
         d[model_id].append(path)
     return {model_id: max(paths) for model_id, paths in d.items()}
 
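This hunk only adjusts spacing around `+ 1`, but the line it touches does the per-model grouping: the slice drops the `RESULTS_DATASET_ID` prefix plus its trailing slash, and `rsplit("/", 1)` peels the filename off the remaining `org/model` path. A minimal sketch of that logic, with hypothetical paths and filenames (not taken from this commit):

from collections import defaultdict

RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"

# Hypothetical paths shaped like "<dataset>/<org>/<model>/<results file>".
paths = [
    "datasets/open-llm-leaderboard/results/org/model-a/results_2024-06-01.json",
    "datasets/open-llm-leaderboard/results/org/model-a/results_2024-06-15.json",
    "datasets/open-llm-leaderboard/results/org/model-b/results_2024-05-30.json",
]

d = defaultdict(list)
for path in paths:
    # "org/model-a/results_....json" -> ("org/model-a", "results_....json")
    model_id, _ = path[len(RESULTS_DATASET_ID) + 1:].rsplit("/", 1)
    d[model_id].append(path)

# max() keeps the lexicographically greatest path per model, which is the
# newest one whenever filenames embed a sortable timestamp.
latest = {model_id: max(ps) for model_id, ps in d.items()}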
@@ -121,7 +45,7 @@ def load_results_dataframe(model_id):
     result_path = get_result_path_from_model(model_id, latest_result_path_per_model)
     data = load_data(result_path)
     model_name = data.get("model_name", "Model")
-    df = pd.json_normalize([{key: value for key, value in data.items() if key not in EXCLUDED_KEYS}])
+    df = pd.json_normalize([{key: value for key, value in data.items()}])
     # df.columns = df.columns.str.split(".")  # .split return a list instead of a tuple
     return df.set_index(pd.Index([model_name])).reset_index()
 
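Assuming the removed line filtered keys through `EXCLUDED_KEYS` (reconstructed above from the deleted constant; the original line was truncated in this view), the comprehension on the `+` line now copies `data` verbatim, and `pd.json_normalize` flattens any nested dicts into dot-separated column names. A small sketch with a hypothetical payload:

import pandas as pd

# Hypothetical results payload; the real files hold evaluation harness output.
data = {
    "model_name": "org/model-a",
    "results": {"leaderboard_ifeval": {"acc,none": 0.55}},
}

df = pd.json_normalize([{key: value for key, value in data.items()}])
print(df.columns.tolist())
# ['model_name', 'results.leaderboard_ifeval.acc,none'] -- nested keys become
# dot-separated column names, which is why the disabled str.split(".") line
# above would produce list column labels rather than tuples.
df = df.set_index(pd.Index([data["model_name"]])).reset_index()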
constants.py
ADDED
@@ -0,0 +1,74 @@
+RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
+# EXCLUDED_KEYS = {
+#     "pretty_env_info",
+#     "chat_template",
+#     "group_subtasks",
+# }
+# EXCLUDED_RESULTS_KEYS = {
+#     "leaderboard",
+# }
+# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
+#     "alias",
+# }
+
+DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
+DETAILS_FILENAME = "samples_{subtask}_*.json"
+TASKS = {
+    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
+    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
+    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
+    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
+    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
+    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
+    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
+}
+SUBTASKS = {
+    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
+    "leaderboard_bbh": [
+        "leaderboard_bbh_boolean_expressions",
+        "leaderboard_bbh_causal_judgement",
+        "leaderboard_bbh_date_understanding",
+        "leaderboard_bbh_disambiguation_qa",
+        "leaderboard_bbh_formal_fallacies",
+        "leaderboard_bbh_geometric_shapes",
+        "leaderboard_bbh_hyperbaton",
+        "leaderboard_bbh_logical_deduction_five_objects",
+        "leaderboard_bbh_logical_deduction_seven_objects",
+        "leaderboard_bbh_logical_deduction_three_objects",
+        "leaderboard_bbh_movie_recommendation",
+        "leaderboard_bbh_navigate",
+        "leaderboard_bbh_object_counting",
+        "leaderboard_bbh_penguins_in_a_table",
+        "leaderboard_bbh_reasoning_about_colored_objects",
+        "leaderboard_bbh_ruin_names",
+        "leaderboard_bbh_salient_translation_error_detection",
+        "leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding",
+        "leaderboard_bbh_temporal_sequences",
+        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
+        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
+        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
+        "leaderboard_bbh_web_of_lies",
+    ],
+    "leaderboard_gpqa": [
+        "leaderboard_gpqa_extended",
+        "leaderboard_gpqa_diamond",
+        "leaderboard_gpqa_main",
+    ],
+    "leaderboard_ifeval": ["leaderboard_ifeval"],
+    # "leaderboard_math_hard": [
+    "leaderboard_math": [
+        "leaderboard_math_algebra_hard",
+        "leaderboard_math_counting_and_prob_hard",
+        "leaderboard_math_geometry_hard",
+        "leaderboard_math_intermediate_algebra_hard",
+        "leaderboard_math_num_theory_hard",
+        "leaderboard_math_prealgebra_hard",
+        "leaderboard_math_precalculus_hard",
+    ],
+    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
+    "leaderboard_musr": [
+        "leaderboard_musr_murder_mysteries",
+        "leaderboard_musr_object_placements",
+        "leaderboard_musr_team_allocation",
+    ],
+}
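A short sketch of how these constants compose downstream: `TASKS` maps a task key to a display name plus the key used to index `SUBTASKS`, and the two `DETAILS_*` templates combine into a glob over a per-model details dataset. The model name and the `**` path segment below are illustrative assumptions, not taken from this commit:

from huggingface_hub import HfFileSystem

from constants import DETAILS_DATASET_ID, DETAILS_FILENAME, SUBTASKS, TASKS

fs = HfFileSystem()

task = "leaderboard_gpqa"
pretty_name, subtasks_key = TASKS[task]  # ("GPQA", "leaderboard_gpqa")
for subtask in SUBTASKS[subtasks_key]:
    # e.g. "datasets/open-llm-leaderboard/org__model-details/**/samples_leaderboard_gpqa_main_*.json"
    pattern = "/".join([
        DETAILS_DATASET_ID.format(model_name_sanitized="org__model"),  # hypothetical model name
        "**",  # assumed repo layout between the dataset root and the samples files
        DETAILS_FILENAME.format(subtask=subtask),
    ])
    detail_files = fs.glob(pattern)  # the "*" in the filename matches the timestamp suffix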