from pathlib import Path
from collections import OrderedDict

# Default value for the "K" setting; the previous default is kept for reference.
DEFAULT_K = "∞"
# DEFAULT_K = "1500"

# Banner image, hosted in the same repo.
banner_url = "https://allenai.github.io/WildBench/gray_banner.png"

# NOTE(review): in the reviewed copy this f-string contained only the text
# "Banner" and never interpolated `banner_url`; the markup looks like it was
# stripped. The <img> wrapper below is a reconstruction -- confirm the exact
# styling against the deployed page.
BANNER = (
    '<div style="display: flex; justify-content: flex-start;">'
    f'<img src="{banner_url}" alt="Banner" '
    'style="width: 40vw; min-width: 300px; max-width: 800px;">'
    "</div>"
)

# NOTE(review): TITLE and WINRATE_HEATMAP also appear to have lost their HTML
# markup. Only the text that is still visible is kept here; restore the
# original tags from version control if they are available.
TITLE = "\n\n🦁 AI2 WildBench Leaderboard "

WINRATE_HEATMAP = "\n"

# BibTeX entry shown to users who want to cite the benchmark.
CITATION_TEXT = """@misc{lin2024wildbench,
    title={WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild},
    author={Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Nouha Dziri and Ronan Le Bras and Yejin Choi},
    year={2024},
    eprint={2406.04770},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2406.04770}
}
"""

# Ordered mapping of column key -> display label. Every label currently equals
# its key, so the mapping is built from a single tuple of names to avoid
# repeating each string twice.
_COLUMN_LABELS = (
    "Model",
    "Mode",
    "Puzzle Acc",
    "Cell Acc",
    "No answer",
    "Easy Puzzle Acc",
    "Hard Puzzle Acc",
    # "Total Puzzles",
    # "Reason Lens",
)
column_names = OrderedDict((label, label) for label in _COLUMN_LABELS)

# Markdown remark explaining how the WB Reward value is assigned.
LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
"""

# Earlier wording of the remarks, kept for reference:
# **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
# **WB Score** individually scores each model based on checklists.
# Evaluator is GPT-4-Turbo.
# Remarks shown on the main leaderboard tab; currently whitespace only.
LEADERBOARD_REMARKS_MAIN = """
"""

# Column name designated as the ranking column.
RANKING_COLUMN = "Puzzle Acc"

# Column names in display order.
ORDERED_COLUMN_NAMES = [
    "Model",
    "Mode",
    "Puzzle Acc",
    "Easy Puzzle Acc",
    "Hard Puzzle Acc",
    "Cell Acc",
    "No answer",
]

# JavaScript: force the `__theme=light` query parameter, then prepend a
# "Rank by:" label to the element with id "rank-column-radio".
# The `//` comments inside must each stay on their own line, otherwise they
# would comment out the code that follows them.
js_light = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }

    // Find the fieldset with the given id
    const fieldset = document.getElementById("rank-column-radio");
    if (!fieldset) {
        // getElementById returns null when the element is absent; skip the
        // decoration instead of throwing a TypeError.
        return;
    }

    // Create a new span element with the text "Rank by:"
    const rankBySpan = document.createElement("span");
    rankBySpan.textContent = "Rank by: ";
    rankBySpan.style.fontWeight = "bold";  // Optional: make the text bold
    rankBySpan.style.fontSize = "19px";  // Larger font size
    rankBySpan.style.paddingRight = "18px";  // Add padding on the right

    // Wrap the span and the labels in a flex container
    const flexContainer = document.createElement("div");
    flexContainer.style.display = "flex";
    flexContainer.style.alignItems = "center";

    // Insert the rankBySpan at the beginning of the flex container
    flexContainer.appendChild(rankBySpan);

    // Move all existing labels into the flex container
    while (fieldset.firstChild) {
        flexContainer.appendChild(fieldset.firstChild);
    }

    // Append the flex container back to the fieldset
    fieldset.appendChild(flexContainer);
}
"""

# JavaScript: reset the scroll position of every `.bubble-wrap` element,
# staggering the resets by 100 ms per element.
js_code = """
function scroll_top() {
    console.log("Hello from Gradio!");
    const bubbles = document.querySelectorAll('.bubble-wrap');
    bubbles.forEach((bubble, index) => {
        setTimeout(() => {
            bubble.scrollTop = 0;
        }, index * 100);  // Delay of 100ms between each iteration
    });
}
"""

# Markdown legend mapping task categories to their abbreviations.
TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)"

# Custom stylesheet. Fixes relative to the previous version:
#   * the duplicate `.markdown-text-tiny` rule was collapsed to the one that
#     actually applied (the later 12pt declaration);
#   * the non-existent `font-color` property was dropped (`color` is already set);
#   * `font-weight: italic` became `font-style: italic`;
#   * `font-decoration: bold` became `font-weight: bold`;
#   * hash-prefixed pseudo-comments were converted to real CSS comments.
css = """
code {
    font-size: large;
}
footer {visibility: hidden}

.top-left-LP{
    margin-top: 6px;
    margin-left: 5px;
}
.no_margin{
    margin-top: 0px;
    margin-left: 0px;
    margin-right: 0px;
    margin-bottom: 0px;
    padding-top: 0px;
    padding-left: 0px;
    padding-right: 0px;
    padding-bottom: 0px;
}
.markdown-text{font-size: 14pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
    font-size: 12pt;
    color: red;
    background-color: yellow;
    font-weight: bold;
}
th {
    text-align: center;
    font-size: 17px; /* Adjust the font size as needed */
}
td {
    font-size: 15px; /* Adjust the font size as needed */
    text-align: center;
}

.sample_button{
    border: 2px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 17pt;
    font-weight: bold;
    margin: 5px;
    background-color: #D8BFD8;
}

.chat-common{
    height: auto;
    max-height: 400px;
    min-height: 100px;
}
.chat-specific{
    height: auto;
    max-height: 600px;
    min-height: 200px;
}
#od-benchmark-tab-table-button{
    font-size: 15pt;
    font-weight: bold;
}

.btn_boderline{
    border: 1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_next{
    border: 0.1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_gray{
    border: 0.5px solid gray;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-style: italic;
}
.btn_boderline_selected{
    border: 2px solid purple;
    background-color: #f2f2f2;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.accordion-label button span{
    font-size: 14pt;
    font-weight: bold;
}

#show-task-categorized span{
    font-size: 13pt;
    font-weight: bold;
}
#show-open-source-models span{
    font-size: 13pt;
    font-weight: bold;
}
#select-models span{
    font-size: 10pt;
}
#select-tasks span{
    font-size: 10pt;
}

.markdown-text-details{
    margin: 10px;
    padding: 10px;
}

button.selected[role="tab"][aria-selected="true"] {
    font-size: 18px; /* or any other size you prefer */
    font-weight: bold;
}
#od-benchmark-tab-table-ablation-button {
    font-size: larger; /* Adjust the font size as needed */
}

.plotly-plot{
    height: auto;
    max-height: 600px;
    min-height: 600px;
}
#length-margin-radio{
    font-size: 10pt;
    /* padding: 0px; */
    /* margin: 1px; */
}
#show-task-categorized{
    font-size: 12pt;
    font-weight: bold;
}
#show-open-source-models{
    font-size: 12pt;
    font-weight: bold;
}
"""