vtrv.vls committed on
Commit
1639c46
•
1 Parent(s): f9d1508
Files changed (3)
  1. app.py +63 -4
  2. constants.py +315 -0
  3. test.md +1 -0
app.py CHANGED
@@ -1,10 +1,69 @@
- import gradio as gr
- from utils import generate
+ import gradio
+ import argparse
  import os

+ from utils import generate
+ from constants import css, js_code, js_light
+
+ MERA_table = None
+
  def gen(content):
      res = generate(content, 'auth_token.json')
      return res

- demo = gr.Interface(fn=gen, inputs="text", outputs="text")
- demo.launch()
+ def tab_arena():
+     arena = gradio.Interface(fn=gen, inputs="text", outputs="text")
+     arena.render()  # render into the surrounding Blocks; launching here would start a second app
+
+ with open("test.md", "r") as f:
+     TEST_MD = f.read()
+
+
+ def build_demo():
+     # global original_dfs, available_models, gpt4t_dfs, haiku_dfs, llama_dfs
+
+     with gradio.Blocks(theme=gradio.themes.Soft(), css=css, js=js_light) as demo:
+         # gradio.HTML(BANNER, elem_id="banner")
+         # gradio.Markdown(HEADER_MD.replace("{model_num}", str(len(original_dfs["-1"]))), elem_classes="markdown-text")
+
+         with gradio.Tabs(elem_classes="tab-buttons") as tabs:
+             with gradio.TabItem("🐼 MERA leaderboard", elem_id="od-benchmark-tab-table", id=0):
+                 gradio.Markdown(TEST_MD, elem_classes="markdown-text-details")
+                 # _tab_leaderboard()
+
+             with gradio.TabItem("🆚 SBS by categories and criteria", elem_id="od-benchmark-tab-table", id=1):
+                 gradio.Markdown(TEST_MD, elem_classes="markdown-text-details")
+
+             with gradio.TabItem("🥊 Model arena", elem_id="od-benchmark-tab-table", id=2):
+                 tab_arena()
+                 # _tab_explore()
+
+             with gradio.TabItem("💪 About MERA", elem_id="od-benchmark-tab-table", id=3):
+                 gradio.Markdown(TEST_MD, elem_classes="markdown-text")
+                 # gradio.Markdown(f"Last updated on **{LAST_UPDATED}** | [Link to V1-legacy](https://huggingface.co/spaces/allenai/WildBench-V1-legacy)", elem_classes="markdown-text-small")
+
+         # with gradio.Row():
+         #     with gradio.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
+         #         gradio.Textbox(
+         #             value=CITATION_TEXT,
+         #             lines=7,
+         #             label="Copy the BibTeX snippet to cite this source",
+         #             elem_id="citation-button",
+         #             show_copy_button=True)
+
+     return demo
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--share", action="store_true")
+     # parser.add_argument("--bench_table", help="Path to MERA table", default="data_dir/MERA_jun2024.jsonl")
+     args = parser.parse_args()
+     # data_load(args.result_file)
+     # TYPES = ["number", "markdown", "number"]
+     demo = build_demo()
+     demo.launch(share=args.share, height=3000, width="110%")
+
+ # demo = gradio.Interface(fn=gen, inputs="text", outputs="text")
+ # demo.launch()
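app.py imports `generate` from a `utils` module that is not touched by this commit, so its contract has to be inferred: it takes the prompt text plus a path to a JSON credential file (`auth_token.json`) and returns the model's reply as a string. A minimal sketch of that assumed shape follows; the endpoint URL, payload fields, and JSON key names are placeholders, not the Space's actual API:

```python
# Hypothetical sketch of utils.generate; not part of this commit.
import json

import requests  # assumption: the helper talks to an HTTP inference API

API_URL = "https://example.com/api/generate"  # placeholder endpoint


def generate(content: str, auth_token_path: str) -> str:
    # Read the credential from the JSON file referenced in app.py.
    with open(auth_token_path, "r") as f:
        token = json.load(f)["token"]  # assumed key name

    resp = requests.post(
        API_URL,
        headers={"Authorization": f"Bearer {token}"},
        json={"prompt": content},  # assumed payload shape
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["text"]  # assumed response field
```

With the `--share` flag registered in the argparse block, the app starts locally with `python app.py` and exposes a public link with `python app.py --share`.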
constants.py ADDED
@@ -0,0 +1,315 @@
+ from pathlib import Path
+ from collections import OrderedDict
+
+ # DEFAULT_K = "∞"
+ DEFAULT_K = "1500"
+
+ banner_url = "https://allenai.github.io/WildBench/gray_banner.png"  # hosted in the same repo
+ BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 800px;"> </div>'
+
+ TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </h1> </body> </html>"
+
+ WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
+
+ CITATION_TEXT = """@misc{wildbench2024,
+     title = {WildBench: Benchmarking Language Models with Challenging Tasks from Real Users in the Wild},
+     author = {Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Ronan Le Bras and Yejin Choi},
+     year = 2024,
+     url = {https://huggingface.co/spaces/allenai/WildBench},
+ }
+ """
+
+ # column_names is an ordered dict: raw result keys -> display labels.
+
+ REWARD_MIX_COLUMN = "🆚 Reward-Mix (Avg)"
+ MACRO_COLUMN = "🆚 Reward (Macro)"
+
+ column_names = OrderedDict({
+     "model_name": "Model",
+     "WB_score": "💯 WB Score",
+     "WB_score.task_macro": "💯 Score Macro",
+     # "Arena Elo (hard) - 2024-05-20": "LMSYS Elo",
+     "Arena Elo (hard-en) - 2024-06-06": "LMSYS Elo",
+     "Arena-Hard v0.1": "Arena-Hard",
+     "AE2.0 LC": "AE2-LCWR",
+     "AE2.0": "AE2-WR",
+     "#chars": "Length",
+     "Length": "Len",
+     "task_macro_reward": "🆚 Task-Macro",
+     # "elo overall": "Overall Elo",
+     # 'Others': 'Misc',
+     # "average": "Task-Avg Elo",
+     # f"mixture_of_rewards.K={K}": "🆚 🎯 Reward-Mix",
+     # f"gpt4t_reward.K={K}": "🆚 GPT4T",
+     # f"haiku_reward.K={K}": "🆚 Haiku",
+     # f"llama_reward.K={K}": "🆚 Llama2",
+ })
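`column_names` maps raw result keys to display labels; nothing in this commit applies it yet, but the natural use is renaming a results dataframe before it reaches the UI, then reordering it with `ORDERED_COLUMN_NAMES` (defined just below). A minimal sketch, where pandas and the JSONL path (borrowed from the commented `--bench_table` default in app.py) are both assumptions:

```python
# Hypothetical wiring of column_names; this commit only defines the mapping.
import pandas as pd

from constants import ORDERED_COLUMN_NAMES, column_names

# Path taken from the commented --bench_table default in app.py.
df = pd.read_json("data_dir/MERA_jun2024.jsonl", lines=True)

# Rename raw result keys to their display labels.
df = df.rename(columns=dict(column_names))

# Keep only the display columns that are present, in leaderboard order.
df = df[[c for c in ORDERED_COLUMN_NAMES if c in df.columns]]
```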
+
+ LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), the reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
+ """
+
+ LEADERBOARD_REMARKS_MAIN = """
+ **WB Reward**: for each pairwise comparison, the reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
+ The baseline models are GPT4-Turbo, Haiku, and Llama2-70B; Mix is the average of the three.
+ **WB Score** scores each model individually based on checklists.
+ The evaluator is GPT-4-Turbo.
+ """
+
+ LENGTH_MARGIN_DESC_MD = """To mitigate length bias, we count it as a **Tie** when A is only **slightly** better than B but is longer than B by more than K chars.
+
+ 🔒 for closed LLMs; 🚨 for newly added models.
+ """
+
+ RANKING_COLUMN = REWARD_MIX_COLUMN
+
+ ORDERED_COLUMN_NAMES = [
+     "Model",
+     MACRO_COLUMN,
+     "💯 Score Macro",
+     REWARD_MIX_COLUMN,
+     # "💯 WB Score",
+     "🆚 🎯 GPT4T",
+     "🆚 🎯 Haiku",
+     "🆚 🎯 Llama",
+     "LMSYS Elo",
+     "Arena-Hard",
+     "AE2-LCWR",
+     # "AE2-WR",
+     "Len",
+ ]
+
+ all_task_types_raw = [
+     'Information seeking',
+     'Coding & Debugging',
+     'Math',
+     'Data Analysis',
+     'Planning',
+     'Reasoning',
+     'Creative Writing',
+     'Editing',
+     'Role playing',
+     'Advice seeking',
+     'Brainstorming',
+     # 'Others'
+ ]
+
+ all_task_types = ['Creative Tasks', 'Planning & Reasoning', 'Math & Data Analysis', 'Information/Advice seeking', 'Coding & Debugging']
+
+ TASK_NAME_MAPPING_RAW = {
+     'Information seeking': 'InfoSek',
+     'Creative Writing': 'CrtWrt',
+     'Coding & Debugging': 'Code',
+     'Reasoning': 'Reason',
+     'Editing': 'Edit',
+     'Math': 'Math',
+     'Planning': 'Plan',
+     'Brainstorming': 'Brnstrm',
+     'Role playing': 'RolPly',
+     'Advice seeking': 'AdvSek',
+     'Data Analysis': 'DataAna',
+ }
+
+ TASK_NAME_MAPPING = {
+     'Planning & Reasoning': '💭 Reason & Plan',
+     'Math & Data Analysis': '📊 Math & Data',
+     'Coding & Debugging': '💻 Code & Debug',
+     'Creative Tasks': '📝 Creative',
+     'Information/Advice seeking': 'ℹ️ Info Seek',
+ }
+
+ # Force the light theme on page load (passed as js= to gradio.Blocks).
+ js_light = """
+ function refresh() {
+     const url = new URL(window.location);
+
+     if (url.searchParams.get('__theme') !== 'light') {
+         url.searchParams.set('__theme', 'light');
+         window.location.href = url.href;
+     }
+ }
+ """
+
+ # Scroll every chat bubble back to the top, staggered by 100 ms per bubble.
+ js_code = """
+ function scroll_top() {
+     console.log("Hello from Gradio!");
+     const bubbles = document.querySelectorAll('.bubble-wrap');
+     bubbles.forEach((bubble, index) => {
+         setTimeout(() => {
+             bubble.scrollTop = 0;
+         }, index * 100); // delay of 100 ms between bubbles
+     });
+ }
+ """
+
+ TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding & Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)"
+
+ css = """
+ code {
+     font-size: large;
+ }
+ footer {visibility: hidden}
+ .top-left-LP{
+     margin-top: 6px;
+     margin-left: 5px;
+ }
+ .no_margin{
+     margin: 0px;
+     padding: 0px;
+ }
+ .markdown-text{font-size: 14pt}
+ .markdown-text-small{font-size: 13pt}
+ .markdown-text-tiny{font-size: 12pt}
+ .markdown-text-tiny-red{
+     font-size: 12pt;
+     color: red;
+     background-color: yellow;
+     font-weight: bold;
+ }
+ th {
+     text-align: center;
+     font-size: 17px; /* adjust as needed */
+ }
+ td {
+     font-size: 15px; /* adjust as needed */
+     text-align: center;
+ }
+
+ .sample_button{
+     border: 1px solid #000000;
+     border-radius: 5px;
+     padding: 5px;
+     font-size: 15pt;
+     font-weight: bold;
+     margin: 5px;
+ }
+
+ .chat-common{
+     height: auto;
+     max-height: 400px;
+     min-height: 100px;
+ }
+ .chat-specific{
+     height: auto;
+     max-height: 600px;
+     min-height: 200px;
+ }
+ #od-benchmark-tab-table-button{
+     font-size: 15pt;
+     font-weight: bold;
+ }
+
+ .btn_boderline{
+     border: 1px solid #000000;
+     border-radius: 5px;
+     padding: 5px;
+     margin: 5px;
+     font-size: 15pt;
+     font-weight: bold;
+ }
+
+ .btn_boderline_next{
+     border: 0.1px solid #000000;
+     border-radius: 5px;
+     padding: 5px;
+     margin: 5px;
+     font-size: 15pt;
+     font-weight: bold;
+ }
+
+ .btn_boderline_gray{
+     border: 0.5px solid gray;
+     border-radius: 5px;
+     padding: 5px;
+     margin: 5px;
+     font-size: 15pt;
+     font-style: italic;
+ }
+ .btn_boderline_selected{
+     border: 2px solid purple;
+     background-color: #f2f2f2;
+     border-radius: 5px;
+     padding: 5px;
+     margin: 5px;
+     font-size: 15pt;
+     font-weight: bold;
+ }
+ .accordion-label button span{
+     font-size: 14pt;
+     font-weight: bold;
+ }
+
+ #show-task-categorized span{
+     font-size: 13pt;
+     font-weight: bold;
+ }
+
+ #show-open-source-models span{
+     font-size: 13pt;
+     font-weight: bold;
+ }
+
+ #select-models span{
+     font-size: 10pt;
+ }
+
+ #select-tasks span{
+     font-size: 10pt;
+ }
+
+ .markdown-text-details{
+     margin: 10px;
+     padding: 10px;
+ }
+
+ button.selected[role="tab"][aria-selected="true"] {
+     font-size: 18px;
+     font-weight: bold;
+ }
+
+ #od-benchmark-tab-table-ablation-button {
+     font-size: larger;
+ }
+
+ .plotly-plot{
+     height: auto;
+     max-height: 600px;
+     min-height: 600px;
+ }
+
+ #length-margin-radio{
+     font-size: 10pt;
+     padding: 0px;
+     margin: 0px;
+ }
+
+ #show-task-categorized{
+     font-size: 12pt;
+     font-weight: bold;
+ }
+
+ #show-open-source-models{
+     font-size: 12pt;
+     font-weight: bold;
+ }
+ """
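For context on how these hooks attach: a JS function passed as `js=` to `gradio.Blocks` runs once on page load, which is how `js_light` forces the light theme in `build_demo`; `js_code` is not referenced anywhere yet and would need to be bound to a specific event. A minimal sketch with a hypothetical button (the button itself is not in this commit):

```python
import gradio

from constants import css, js_code, js_light

with gradio.Blocks(css=css, js=js_light) as demo:  # js_light runs on load
    btn = gradio.Button("Scroll chats to top")  # hypothetical control
    # fn=None with js= runs the snippet purely client-side.
    btn.click(None, None, None, js=js_code)

demo.launch()
```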
test.md ADDED
@@ -0,0 +1 @@
+ ## TEST