Clémentine committed
Commit 4b2522c
1 Parent(s): 509661e
need to add the selectors now
Files changed:
- .gitignore +2 -0
- README.md +47 -1
- app.py +33 -0
- leaderboards_metadata.py +0 -107
- requirements.txt +1 -0
- src/leaderboards/get_from_hub.py +66 -0
- src/leaderboards/saved.py +42 -0
- src/static/about.py +64 -0
- src/static/env.py +7 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+*.pyc
+.vscode
README.md
CHANGED
@@ -9,4 +9,50 @@ app_file: app.py
 pinned: false
 ---
 
-
+If you want your leaderboard to appear, feel free to add relevant information in its metadata, and it will be displayed here.
+
+# Categories
+
+## Submission type
+Arenas are not concerned by this category.
+
+- `submission:automatic`: users can submit their models directly to the leaderboard, and the evaluation is run automatically without human intervention
+- `submission:semiautomatic`: the leaderboard requires the model owner to run the evaluations on their side and submit the results
+- `submission:manual`: the leaderboard owner runs the evaluations for new submissions
+- `submission:closed`: the leaderboard does not accept submissions at the moment
+
+## Test set status
+Arenas are not concerned by this category.
+
+- `test:public`: all the test sets used are public, so the evaluations are completely reproducible
+- `test:mix`: some test sets are public and some are private
+- `test:private`: all the test sets used are private, so the evaluations are hard to game
+- `test:rolling`: the test sets change regularly over time and the evaluation scores are refreshed
+
+## Judges
+- `judge:auto`: evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`
+- `judge:model`: evaluations are run with a model-as-a-judge approach to rate answers
+- `judge:humans`: evaluations are done by humans rating answers - this is an arena
+- `judge:vibe_check`: evaluations are done manually by a single human
+
+## Modalities
+Can be any (or several) of the following:
+- `modality:text`
+- `modality:image`
+- `modality:video`
+- `modality:audio`
+A bit outside of the usual modalities:
+- `modality:tools`: requires added tool usage - mostly for assistant models
+- `modality:artefacts`: the leaderboard evaluates machine learning artefacts themselves, for example the quality of text embeddings
+
+## Evaluation categories
+Can be any (or several) of the following:
+- `eval:generation`: the evaluation looks specifically at generation capabilities (image generation, text generation, ...)
+- `eval:math`
+- `eval:code`
+- `eval:performance`: model performance (speed, energy consumption, ...)
+- `eval:safety`: safety, toxicity, and bias evaluations
+
+## Language
+You can indicate the languages covered by your benchmark like so: `language:mylanguage`.
+At the moment, we do not support language codes; please use the language name in English.
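The taxonomy above maps directly onto a space's README front matter. As an illustration only (the tag combination below is hypothetical, not taken from this commit), a leaderboard space opting in to the explorer might declare:

```
---
title: My Leaderboard
sdk: gradio
app_file: app.py
pinned: false
tags:
  - leaderboard
  - submission:automatic
  - test:public
  - judge:auto
  - modality:text
  - eval:code
  - language:english
---
```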
app.py
ADDED
@@ -0,0 +1,33 @@
+import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
+from src.static.env import API, REPO_ID, HF_TOKEN
+from src.static.about import TITLE, INTRO, ABOUT
+
+from src.leaderboards.get_from_hub import get_leaderboard_info
+
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
+
+leaderboards_to_info, info_to_leaderboards = get_leaderboard_info()
+
+
+demo = gr.Blocks()
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRO, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("Search"):
+            gr.Markdown("Let's look for leaderboards relevant to you! Select the categories of your choice.")
+
+
+
+        with gr.TabItem("About"):
+            gr.Markdown(ABOUT, elem_classes="markdown-text")
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=10800)  # restart the space every 3 hours
+scheduler.start()
+
+demo.queue(default_concurrency_limit=40).launch()
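The Search tab is left empty here; the commit message notes the selectors still need to be added. A minimal sketch of how such selectors could be wired to the `info_to_leaderboards` mapping follows - the widget layout, helper name, and toy data are assumptions, not part of this commit:

```
# Hypothetical sketch of the missing Search selectors; not part of this commit.
# `info_to_leaderboards` is a toy stand-in for the mapping returned by
# get_leaderboard_info(): category -> tag value -> list of space names.
import gradio as gr

info_to_leaderboards = {
    "modality": {"text": ["HuggingFaceH4/open_llm_leaderboard"], "audio": ["TTS-AGI/TTS-Arena"]},
    "eval": {"code": ["bigcode/bigcode-models-leaderboard"], "math": ["HuggingFaceH4/open_llm_leaderboard"]},
}

def filter_leaderboards(modalities: list[str], evals: list[str]) -> str:
    """Return the leaderboards carrying every selected tag, as a markdown list."""
    matching = None
    for category, values in (("modality", modalities), ("eval", evals)):
        for value in values:
            names = set(info_to_leaderboards[category].get(value, []))
            matching = names if matching is None else matching & names
    if matching is None:  # nothing selected yet
        return ""
    if not matching:
        return "No leaderboard found for this selection."
    return "\n".join(f"- {name}" for name in sorted(matching))

with gr.Blocks() as demo:
    modality_selector = gr.CheckboxGroup(choices=sorted(info_to_leaderboards["modality"]), label="Modalities")
    eval_selector = gr.CheckboxGroup(choices=sorted(info_to_leaderboards["eval"]), label="Evaluation categories")
    results = gr.Markdown()
    modality_selector.change(filter_leaderboards, [modality_selector, eval_selector], results)
    eval_selector.change(filter_leaderboards, [modality_selector, eval_selector], results)

if __name__ == "__main__":
    demo.launch()
```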
leaderboards_metadata.py
DELETED
@@ -1,107 +0,0 @@
-from enum import Enum, auto
-#from dataclasses import dataclass
-
-SubmissionType = Enum(
-    "SubmissionType",
-    [
-        "Automatic",
-        "SemiAutomatic",
-        "Manual",
-        "Closed",
-        "Arena"
-    ]
-)
-
-Evaluators = Enum(
-    "Evaluators",
-    [
-        "Humans",  # Arena
-        "Automatic",
-        "Model"
-    ]
-)
-
-TestSet = Enum(
-    "TestSet",
-    [
-        "Private",
-        "Public",
-        "Mix",
-        "Rolling",
-        "N/A"
-    ]
-)
-
-Categories = Enum(
-    "Categories",
-    [
-        "Text",
-        "Image",
-        "Audio",
-        "Video",
-        "Multimodal",
-        "Generation",
-        "Math",
-        "Code",
-        "LanguageSpecific",
-        "Performance",
-        "Safety",
-        "VibeCheck",
-        "Tools",
-        "Artefacts"
-    ]
-)
-
-Languages = Enum(
-    "Languages",
-    [
-        "Chinese",
-        "Korean",
-        "Dutch",
-        "Portuguese",
-        "Italian",
-        "Malay",
-        "Polish",
-        "Turkish"
-    ]
-)
-
-leaderboard_to_tags = {
-    "HuggingFaceH4/open_llm_leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Public, Categories.Text, Categories.Math],
-    "bigcode/bigcode-models-leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Automatic, TestSet.Public, Categories.Code],
-    "optimum/llm-perf-leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Performance],
-    "lmsys/chatbot-arena-leaderboard": [SubmissionType.Arena, Evaluators.Humans, Categories.Text, Categories.Generation],
-    "llmonitor/benchmarks": [SubmissionType.Manual, Evaluators.Humans, Categories.Text, Categories.VibeCheck],
-    "mteb/leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, "Embeddings", Categories.Artefacts],
-    "gaia-benchmark/leaderboard": [SubmissionType.Automatic, TestSet.Private, Evaluators.Automatic, Categories.Text, Categories.Tools, Categories.Multimodal],
-    "opencompass/opencompass-llm-leaderboard": [SubmissionType.Manual, Categories.Text, Categories.LanguageSpecific, Languages.Chinese],
-    "upstage/open-ko-llm-leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Mix, Categories.Text, Languages.Korean],
-    "BramVanroy/open_dutch_llm_leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Text, Languages.Dutch],
-    "vectara/leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Model, Categories.Text, "Hallucinations"],
-    "facebook/CyberSecEval": [SubmissionType.Closed, Categories.Code, Categories.Safety],
-    "mlabonne/Yet_Another_LLM_Leaderboard": [SubmissionType.Manual, Categories.Text, Evaluators.Automatic],
-    "AI-Secure/llm-trustworthy-leaderboard": [SubmissionType.Automatic, Categories.Safety, Categories.Text],
-    "AILab-CVC/EvalCrafter": [SubmissionType.Closed, Categories.Video, Categories.Generation],
-    "mike-ravkine/can-ai-code-results": [SubmissionType.Closed, Categories.Code],
-    "echo840/ocrbench-leaderboard": [SubmissionType.Closed, Categories.Image, "OCR"],
-    "NPHardEval/NPHardEval-leaderboard": [SubmissionType.Closed, Categories.Text, Categories.Math, TestSet.Rolling],
-    "HaizeLabs/red-teaming-resistance-benchmark": [SubmissionType.Manual, Categories.Safety, Categories.Text],
-    "devingulliver/subquadratic-llm-leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, Categories.Math],
-    "WildVision/vision-arena": [SubmissionType.Arena, Categories.Image, Categories.Multimodal],
-    "Vchitect/VBench_Leaderboard": [SubmissionType.SemiAutomatic, Categories.Video, Categories.Generation],
-    "eduagarcia/open_pt_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Portuguese],
-    "FinancialSupport/open_ita_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Italian],
-    "mesolitica/malay-llm-leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Malay],
-    "TIGER-Lab/GenAI-Arena": [Categories.Image, Categories.Generation, Evaluators.Humans, SubmissionType.Arena],
-    "q-future/Q-Bench-Leaderboard": [Categories.Image, Evaluators.Automatic, SubmissionType.Closed],
-    "OpenGenAI/parti-prompts-leaderboard": [Categories.Image, Categories.Generation, SubmissionType.Arena, Evaluators.Humans],
-    "speakleash/open_pl_llm_leaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Polish],
-    "malhajar/OpenLLMTurkishLeaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Turkish],
-    "allenai/WildBench": [Evaluators.Humans, SubmissionType.Arena, Evaluators.Model, Categories.Text, Categories.Generation],
-    "hf-audio/open_asr_leaderboard": [Evaluators.Automatic, Categories.Audio],
-    "opencompass/open_vlm_leaderboard": [Evaluators.Automatic, Categories.Generation, Categories.Image],
-    "livecodebench/benchmarks": [Evaluators.Automatic, Categories.Code],
-    "allenai/reward-bench": [Evaluators.Automatic, Categories.Artefacts, "Models", Categories.Text],
-    "TTS-AGI/TTS-Arena": [Evaluators.Humans, Categories.Audio]
-}
requirements.txt
ADDED
@@ -0,0 +1 @@
+huggingface_hub
src/leaderboards/get_from_hub.py
ADDED
@@ -0,0 +1,66 @@
+from collections import defaultdict
+
+from src.leaderboards.saved import leaderboard_to_tags
+from src.static.env import API
+
+def group_all_tags(input_tags: list[str]) -> dict:
+    """Groups the tags by category, following the division in the README.
+
+    Args:
+        input_tags (list[str]): list of tags
+
+    Returns:
+        dict: category to tag value list
+    """
+    output_tags = defaultdict(list)
+    for tag in input_tags:
+        if tag == "arena":
+            # the bare `arena` tag is equivalent to `judge:humans`
+            output_tags["judge"].append("humans")
+            continue
+
+        try:
+            category, value = tag.split(":")
+            output_tags[category].append(value)
+        except ValueError:
+            # skip tags that do not follow the `category:value` convention
+            continue
+
+    return output_tags
+
+
+def get_leaderboard_info() -> tuple[list, dict]:
+    """Looks up all spaces tagged as leaderboards or arenas on the hub,
+    and homogenizes their tags.
+
+    Returns:
+        tuple: a list of per-leaderboard info dicts (grouped tags plus the space name),
+            and a dict mapping each category and tag value to the leaderboards carrying it
+    """
+    leaderboards = [
+        (s.id, s.tags) for s in API.list_spaces(
+            filter=["leaderboard"]
+        )]
+    arenas = [
+        (s.id, s.tags) for s in API.list_spaces(
+            filter=["arena"]
+        )]
+    saved_leaderboards = [(k, v) for k, v in leaderboard_to_tags.items()]
+
+    seen_leaderboards = []
+    leaderboard_df = []
+    info_to_leaderboard = defaultdict(lambda: defaultdict(list))
+    for name, tags in leaderboards + arenas + saved_leaderboards:
+        if name in seen_leaderboards:
+            continue
+
+        seen_leaderboards.append(name)
+
+        # merge in the manually curated tags, without mutating the saved lists
+        if name in leaderboard_to_tags:
+            tags = list(dict.fromkeys(list(tags) + leaderboard_to_tags[name]))
+
+        grouped_tags = group_all_tags(tags)
+        for category, tag_values in grouped_tags.items():
+            for tag in tag_values:
+                info_to_leaderboard[category][tag].append(name)
+
+        current_info = grouped_tags
+        current_info["name"] = name
+        leaderboard_df.append(current_info)
+    return leaderboard_df, info_to_leaderboard
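To make the grouping concrete, here is what `group_all_tags` returns for a typical tag list (a small illustrative run, assuming the function as defined above; the tag list itself is made up):

```
from src.leaderboards.get_from_hub import group_all_tags

tags = ["leaderboard", "submission:automatic", "judge:auto", "modality:text", "eval:math", "arena"]
print(dict(group_all_tags(tags)))
# The bare "leaderboard" tag has no `category:value` form, so it is dropped,
# and "arena" is folded into the judge category:
# {'submission': ['automatic'], 'judge': ['auto', 'humans'], 'modality': ['text'], 'eval': ['math']}
```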
src/leaderboards/saved.py
ADDED
@@ -0,0 +1,42 @@
+"""
+Default leaderboards with which we initialize the space.
+"""
+
+leaderboard_to_tags = {
+    "HuggingFaceH4/open_llm_leaderboard": ["submission:automatic", "judge:auto", "test:public", "modality:text", "eval:math"],
+    "bigcode/bigcode-models-leaderboard": ["submission:semiautomatic", "judge:auto", "test:public", "eval:code"],
+    "optimum/llm-perf-leaderboard": ["submission:manual", "judge:auto", "eval:performance"],
+    "lmsys/chatbot-arena-leaderboard": ["judge:humans", "modality:text", "eval:generation"],
+    "llmonitor/benchmarks": ["submission:manual", "judge:humans", "modality:text", "judge:vibe_check"],
+    "mteb/leaderboard": ["submission:semiautomatic", "modality:text", "Embeddings", "modality:artefacts"],
+    "gaia-benchmark/leaderboard": ["submission:automatic", "test:private", "judge:auto", "modality:text", "modality:tools", "modality:image", "modality:video"],
+    "opencompass/opencompass-llm-leaderboard": ["submission:manual", "modality:text", "language:chinese"],
+    "upstage/open-ko-llm-leaderboard": ["submission:automatic", "judge:auto", "test:mix", "modality:text", "language:korean"],
+    "BramVanroy/open_dutch_llm_leaderboard": ["submission:manual", "judge:auto", "modality:text", "language:dutch"],
+    "vectara/leaderboard": ["submission:semiautomatic", "judge:model", "modality:text", "Hallucinations"],
+    "facebook/CyberSecEval": ["submission:closed", "eval:code", "eval:safety"],
+    "mlabonne/Yet_Another_LLM_Leaderboard": ["submission:manual", "modality:text", "judge:auto"],
+    "AI-Secure/llm-trustworthy-leaderboard": ["submission:automatic", "eval:safety", "modality:text"],
+    "AILab-CVC/EvalCrafter": ["submission:closed", "modality:video", "eval:generation"],
+    "mike-ravkine/can-ai-code-results": ["submission:closed", "eval:code"],
+    "echo840/ocrbench-leaderboard": ["submission:closed", "modality:image", "OCR"],
+    "NPHardEval/NPHardEval-leaderboard": ["submission:closed", "modality:text", "eval:math", "test:rolling"],
+    "HaizeLabs/red-teaming-resistance-benchmark": ["submission:manual", "eval:safety", "modality:text"],
+    "devingulliver/subquadratic-llm-leaderboard": ["submission:semiautomatic", "modality:text", "eval:math"],
+    "WildVision/vision-arena": ["modality:image", "modality:text", "judge:humans"],
+    "Vchitect/VBench_Leaderboard": ["submission:semiautomatic", "modality:video", "eval:generation"],
+    "eduagarcia/open_pt_llm_leaderboard": ["modality:text", "language:portuguese"],
+    "FinancialSupport/open_ita_llm_leaderboard": ["modality:text", "language:italian"],
+    "mesolitica/malay-llm-leaderboard": ["modality:text", "language:malay"],
+    "TIGER-Lab/GenAI-Arena": ["modality:image", "eval:generation", "judge:humans"],
+    "q-future/Q-Bench-Leaderboard": ["modality:image", "judge:auto", "submission:closed"],
+    "OpenGenAI/parti-prompts-leaderboard": ["modality:image", "eval:generation", "judge:humans"],
+    "speakleash/open_pl_llm_leaderboard": ["modality:text", "language:polish"],
+    "malhajar/OpenLLMTurkishLeaderboard": ["modality:text", "language:turkish"],
+    "allenai/WildBench": ["judge:humans", "judge:model", "modality:text", "eval:generation"],
+    "hf-audio/open_asr_leaderboard": ["judge:auto", "modality:audio"],
+    "opencompass/open_vlm_leaderboard": ["judge:auto", "eval:generation", "modality:image"],
+    "livecodebench/benchmarks": ["judge:auto", "eval:code"],
+    "allenai/reward-bench": ["judge:auto", "modality:artefacts", "Models", "modality:text"],
+    "TTS-AGI/TTS-Arena": ["judge:humans", "modality:audio"]
+}
src/static/about.py
ADDED
@@ -0,0 +1,64 @@
+TITLE = "# Leaderboard explorer"
+
+INTRO = """
+Have you ever wondered which leaderboard would be best for your use case?
+"""
+
+ABOUT = """
+If you want your leaderboard to appear in our suggestions, feel free to add relevant information in its tag metadata, and it will be displayed here.
+
+# First step
+
+Make sure to add either the `leaderboard` or `arena` tag to your space, by adding the following to your README:
+
+```
+tags:
+  - leaderboard
+```
+
+# Extra tags
+
+## Submission type
+Arenas are not concerned by this category.
+
+- `submission:automatic`: users can submit their models directly to the leaderboard, and the evaluation is run automatically without human intervention
+- `submission:semiautomatic`: the leaderboard requires the model owner to run the evaluations on their side and submit the results
+- `submission:manual`: the leaderboard owner runs the evaluations for new submissions
+- `submission:closed`: the leaderboard does not accept submissions at the moment
+
+## Test set status
+Arenas are not concerned by this category.
+
+- `test:public`: all the test sets used are public, so the evaluations are completely reproducible
+- `test:mix`: some test sets are public and some are private
+- `test:private`: all the test sets used are private, so the evaluations are hard to game
+- `test:rolling`: the test sets change regularly over time and the evaluation scores are refreshed
+
+## Judges
+- `judge:auto`: evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`
+- `judge:model`: evaluations are run with a model-as-a-judge approach to rate answers
+- `judge:humans`: evaluations are done by humans rating answers - this is an arena
+- `judge:vibe_check`: evaluations are done manually by a single human
+
+## Modalities
+Can be any (or several) of the following:
+- `modality:text`
+- `modality:image`
+- `modality:video`
+- `modality:audio`
+A bit outside of the usual modalities:
+- `modality:tools`: requires added tool usage - mostly for assistant models
+- `modality:artefacts`: the leaderboard evaluates machine learning artefacts themselves, for example the quality of text embeddings
+
+## Evaluation categories
+Can be any (or several) of the following:
+- `eval:generation`: the evaluation looks specifically at generation capabilities (image generation, text generation, ...)
+- `eval:math`
+- `eval:code`
+- `eval:performance`: model performance (speed, energy consumption, ...)
+- `eval:safety`: safety, toxicity, and bias evaluations
+
+## Language
+You can indicate the languages covered by your benchmark like so: `language:mylanguage`.
+At the moment, we do not support language codes; please use the language name in English.
+"""
src/static/env.py
ADDED
@@ -0,0 +1,7 @@
+import os
+from huggingface_hub import HfApi
+
+REPO_ID = "clefourrier/LeaderboardFinder"
+HF_TOKEN = None  # os.getenv("HF_TOKEN")
+
+API = HfApi(token=HF_TOKEN)