from pathlib import Path
from collections import OrderedDict
# Default value for the "K" setting, stored as a display string.
# NOTE(review): presumably the length-margin K of the original WildBench UI —
# confirm against the code that reads it. A previously used alternative
# value was "1500".
DEFAULT_K = "∞"

# Banner image URL (the same repo here).
banner_url = "https://allenai.github.io/WildBench/gray_banner.png"

# HTML snippet that shows the banner image.
# NOTE(review): the previous literal was a single-quoted f-string broken across
# two physical lines (a SyntaxError that prevented the module from importing)
# and contained no markup at all. This is a minimal reconstruction that embeds
# banner_url — confirm the intended HTML/styling.
BANNER = f'<img src="{banner_url}" alt="Banner">'

# Page title text (leading/trailing spaces are part of the value).
TITLE = " 🦁 AI2 WildBench Leaderboard "

# Placeholder for the win-rate heatmap content; currently empty.
WINRATE_HEATMAP = ""
# BibTeX entry for the WildBench paper, one field per line, ending with a
# trailing newline after the closing brace.
CITATION_TEXT = (
    "@misc{lin2024wildbench,\n"
    "title={WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild},\n"
    "author={Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Nouha Dziri and Ronan Le Bras and Yejin Choi},\n"
    "year={2024},\n"
    "eprint={2406.04770},\n"
    "archivePrefix={arXiv},\n"
    "primaryClass={cs.CL},\n"
    "url={https://arxiv.org/abs/2406.04770}\n"
    "}\n"
)
# Ordered mapping of column name -> column label. Every entry currently maps
# a name to itself; insertion order follows the tuple below.
column_names = OrderedDict(
    (name, name)
    for name in (
        "Model",
        "Mode",
        "Puzzle Acc",
        "Cell Acc",
        "No answer",
        "Easy Puzzle Acc",
        "Hard Puzzle Acc",
        # Disabled columns:
        # "Total Puzzles",
        # "Reason Lens",
    )
)
# Markdown remark explaining how the "WB Reward" is computed for a pairwise
# comparison; the value ends with a newline.
LEADERBOARD_REMARKS = (
    "**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** "
    "if A is **much better/worse** than B, and **+/-0.5** if A is "
    "**slightly better/worse** than B; when there is a **Tie**, the reward "
    "is **0**.\n"
)
# Earlier wording of the remarks, kept for reference:
# **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
# **WB Score** individually scores each model based on checklists.
# Evaluator is GPT-4-Turbo.

# Remarks for the main tab; currently just a single newline.
LEADERBOARD_REMARKS_MAIN = "\n"
# Column names in the order given here; the puzzle-accuracy columns are
# grouped together ahead of "Cell Acc" and "No answer".
ORDERED_COLUMN_NAMES = [
    "Model", "Mode",
    "Puzzle Acc", "Easy Puzzle Acc", "Hard Puzzle Acc",
    "Cell Acc", "No answer",
]

# Name of the column used for ranking; it is one of ORDERED_COLUMN_NAMES.
RANKING_COLUMN = "Puzzle Acc"
# JavaScript source for a `refresh()` function that:
#   1. forces the `__theme=light` query parameter (reloading the page if it
#      is not already set), and
#   2. prepends a bold "Rank by:" label to the element with id
#      "rank-column-radio", laying label and options out in a flex row.
# The lookup is guarded: when that element is not on the page the function
# returns instead of throwing a TypeError on `fieldset.firstChild`.
# NOTE(review): presumably passed to Gradio as a JS hook — confirm at the call site.
js_light = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
    // Find the fieldset with the given id
    const fieldset = document.getElementById("rank-column-radio");
    // Nothing to decorate when the radio group is not rendered
    if (!fieldset) {
        return;
    }
    // Create a new span element with the text "Rank by:"
    const rankBySpan = document.createElement("span");
    rankBySpan.textContent = "Rank by: ";
    rankBySpan.style.fontWeight = "bold"; // Optional: make the text bold
    rankBySpan.style.fontSize = "19px"; // Larger font size
    rankBySpan.style.paddingRight = "18px"; // Add padding on the right
    // Wrap the span and the labels in a flex container
    const flexContainer = document.createElement("div");
    flexContainer.style.display = "flex";
    flexContainer.style.alignItems = "center";
    // Insert the rankBySpan at the beginning of the flex container
    flexContainer.appendChild(rankBySpan);
    // Move all existing labels into the flex container
    while (fieldset.firstChild) {
        flexContainer.appendChild(fieldset.firstChild);
    }
    // Append the flex container back to the fieldset
    fieldset.appendChild(flexContainer);
}
"""

# JavaScript source for a `scroll_top()` function that resets `scrollTop` to 0
# on every `.bubble-wrap` element, staggering the resets by 100 ms per element.
js_code = """
function scroll_top() {
    console.log("Hello from Gradio!");
    const bubbles = document.querySelectorAll('.bubble-wrap');
    bubbles.forEach((bubble, index) => {
        setTimeout(() => {
            bubble.scrollTop = 0;
        }, index * 100); // Delay of 100ms between each iteration
    });
}
"""
# Markdown legend listing each task category with its bold abbreviation,
# assembled as "**Tasks**: " followed by the comma-separated entries.
TASK_TYPE_STR = "**Tasks**: " + ", ".join(
    (
        "Info seeking (**InfoSek**)",
        "Creative Writing (**CrtWrt**)",
        "Coding&Debugging (**Code**)",
        "Reasoning (**Reason**)",
        "Editing (**Edit**)",
        "**Math**",
        "Planning (**Plan**)",
        "Brainstorming (**Brnstrm**)",
        "Role playing (**RolPly**)",
        "Advice seeking (**AdvSek**)",
        "Data Analysis (**DataAna**)",
    )
)
# Stylesheet text for the UI, kept as one CSS string.
# Corrections relative to the previous version (selectors are unchanged):
#   * removed `font-color`, which is not a CSS property (`color` is already set);
#   * `font-weight: italic` -> `font-style: italic` (italic is a style, not a weight);
#   * `font-decoration: bold` -> `font-weight: bold` (no such property exists);
#   * `#`-prefixed lines inside a rule are not CSS comments -> `/* ... */`;
#   * dropped the first `.markdown-text-tiny` rule (10pt), which was always
#     overridden by the later 12pt rule for the same selector.
css = """
code {
    font-size: large;
}
footer {visibility: hidden}
.top-left-LP{
    margin-top: 6px;
    margin-left: 5px;
}
.no_margin{
    margin-top: 0px;
    margin-left: 0px;
    margin-right: 0px;
    margin-bottom: 0px;
    padding-top: 0px;
    padding-left: 0px;
    padding-right: 0px;
    padding-bottom: 0px;
}
.markdown-text{font-size: 14pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
    font-size: 12pt;
    color: red;
    background-color: yellow;
    font-weight: bold;
}
th {
    text-align: center;
    font-size: 17px; /* Adjust the font size as needed */
}
td {
    font-size: 15px; /* Adjust the font size as needed */
    text-align: center;
}
.sample_button{
    border: 2px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 17pt;
    font-weight: bold;
    margin: 5px;
    background-color: #D8BFD8;
}
.chat-common{
    height: auto;
    max-height: 400px;
    min-height: 100px;
}
.chat-specific{
    height: auto;
    max-height: 600px;
    min-height: 200px;
}
#od-benchmark-tab-table-button{
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline{
    border: 1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_next{
    border: 0.1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_gray{
    border: 0.5px solid gray;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-style: italic;
}
.btn_boderline_selected{
    border: 2px solid purple;
    background-color: #f2f2f2;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.accordion-label button span{
    font-size: 14pt;
    font-weight: bold;
}
#show-task-categorized span{
    font-size: 13pt;
    font-weight: bold;
}
#show-open-source-models span{
    font-size: 13pt;
    font-weight: bold;
}
#select-models span{
    font-size: 10pt;
}
#select-tasks span{
    font-size: 10pt;
}
.markdown-text-details{
    margin: 10px;
    padding: 10px;
}
button.selected[role="tab"][aria-selected="true"] {
    font-size: 18px; /* or any other size you prefer */
    font-weight: bold;
}
#od-benchmark-tab-table-ablation-button {
    font-size: larger; /* Adjust the font size as needed */
}
.plotly-plot{
    height: auto;
    max-height: 600px;
    min-height: 600px;
}
#length-margin-radio{
    font-size: 10pt;
    /* padding: 0px; */
    /* margin: 1px; */
}
#show-task-categorized{
    font-size: 12pt;
    font-weight: bold;
}
#show-open-source-models{
    font-size: 12pt;
    font-weight: bold;
}
"""