|
import streamlit as st |
|
import pandas as pd |
|
from PIL import Image |
|
import base64 |
|
from io import BytesIO |
|
|
|
|
|
# Set the browser-tab title and switch the app to the full-width layout.
st.set_page_config(page_title="FactBench Leaderboard", layout="wide")
|
|
|
|
|
# Markdown header text that ships next to the app; read once at start-up.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open("_header.md", "r", encoding="utf-8") as f:
    HEADER_MD = f.read()

# Static images used further down: the evaluation-pipeline figure shown in
# the "Benchmark Details" tab and the logo inlined into the page header.
image = Image.open("factEvalSteps.png")
logo_image = Image.open("Factbench_logo.png")
|
|
|
|
|
# Page-wide look and feel: base font and background, the title / description /
# header / container helper classes, and the styling of the leaderboard table.
_GLOBAL_CSS = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');

html, body, [class*="css"] {
    font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
    background-color: #f9f9f9; /* Light grey background */
}

.title {
    font-size: 42px;
    font-weight: bold;
    text-align: center;
    color: #333;
    margin-bottom: 5px;
}

.description {
    font-size: 22px;
    text-align: center;
    margin-bottom: 30px;
    color: #555;
}

.header, .metric {
    align-items: left;
    font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
    margin-bottom: 20px;
}

.container {
    max-width: 1000px;
    margin: 0 auto;
    padding: 5px;
}

table {
    width: 100%;
    border-collapse: collapse;
    border-radius: 10px;
    overflow: hidden;
}

th, td {
    padding: 8px;
    text-align: center;
    border: 1px solid #ddd;
    font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
    font-size: 16px;
    transition: background-color 0.3s;
}

th {
    background-color: #f2f2f2;
    font-weight: bold;
}

td:hover {
    background-color: #eaeaea;
}
</style>
"""

# The stylesheet is raw HTML, so it is passed with unsafe_allow_html=True.
st.markdown(_GLOBAL_CSS, unsafe_allow_html=True)
|
|
|
|
|
# Emit the opening tag of the "container" wrapper used for the page header.
st.markdown('<div class="container">', unsafe_allow_html=True)

# Serialise the logo to PNG in memory and base64-encode it so it can be
# inlined into the page as a data URI instead of being served as a file.
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
png_bytes = buffered.getvalue()
img_data = base64.b64encode(png_bytes).decode("utf-8")

# Logo styling plus the <img> element itself (braces doubled for the f-string).
logo_html = f"""
<style>
.logo-container {{
    display: flex;
    justify-content: flex-start; /* Aligns to the left */
}}
.logo-container img {{
    width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
    margin: 0 auto;
    max-width: 700px; /* Set a maximum width */
    background-color: transparent;
}}
</style>
<div class="logo-container">
    <img src="data:image/png;base64,{img_data}" alt="FactBench Leaderboard Logo">
</div>
"""
st.markdown(logo_html, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
# Project title, resource links and leaderboard version information.
#
# The emoji in this literal had been corrupted (UTF-8 bytes shown as Greek
# letters) and are restored here. The laptop, hugging-face, bird and
# speech-bubble glyphs are recoverable from the surviving bytes.
# NOTE(review): the title emoji, the "Paper" emoji and the "Version" emoji
# could not be recovered unambiguously; the ones below are best guesses and
# should be confirmed against the intended design.
#
# The HuggingFace links previously pointed at the host "huggingface.co."
# (with a stray trailing dot); they now use the plain host name.
st.markdown(
    '''
    <div class="header">
        <br/>
        <p style="font-size:22px;">
            🏆 FactBench: A Dynamic Benchmark for In-the-Wild Language Model Factuality Evaluation
        </p>
        <p style="font-size:20px;">
            📑 <a href="https://arxiv.org/abs/2410.22257v1">Paper</a> | 💻 <a href="https://github.com/launchnlp/FactBench">GitHub</a> | 🤗 <a href="https://huggingface.co/datasets/launch/FactBench">HuggingFace</a> | 🐦 <a href="https://x.com/FarimaFB/status/1851752079318261933">X</a> | 💬 <a href="https://huggingface.co/spaces/launch/factbench/discussions">Discussion</a> |
            ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 7 | Updated: <strong>10/26/2024</strong>
        </p>
    </div>
    ''',
    unsafe_allow_html=True
)

# Emit the closing tag matching the earlier '<div class="container">'.
st.markdown('</div>', unsafe_allow_html=True)
|
|
|
|
|
# Load the per-tier leaderboard results.
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)

# Rank models inside each tier by factuality score: the highest score gets
# rank 1 and tied scores share the smallest rank (method='min').
scores_by_tier = df.groupby('tier')['factuality_score']
df['rank'] = scores_by_tier.rank(ascending=False, method='min').astype(int)

# Missing values are displayed as a dash. This runs after ranking so the
# rank computation still sees the original numeric scores.
df.fillna('-', inplace=True)

# Remember each row's position within its tier so the table can be shown in
# file order when score-based sorting is switched off.
df['original_order'] = df.groupby('tier').cumcount()
|
|
|
|
|
# Enlarge the text of the tab labels.
_TAB_LABEL_CSS = """
<style>
.stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
    font-size: 20px;
}
</style>
"""
st.markdown(_TAB_LABEL_CSS, unsafe_allow_html=True)

# The three top-level sections of the app.
tab_labels = ["Leaderboard", "Benchmark Details", "Submit your models"]
tab1, tab2, tab3 = st.tabs(tab_labels)
|
|
|
|
|
def _format_cell(value, decimals=None):
    """Return *value* as the text of one table cell.

    When ``decimals`` is given the value is formatted with that many fixed
    decimal places. Values that do not accept a numeric format spec (such as
    the '-' placeholder substituted for missing data) are rendered with
    ``str`` instead of raising.
    """
    if decimals is None:
        return str(value)
    try:
        return format(value, f".{decimals}f")
    except (TypeError, ValueError):
        return str(value)


def _leaderboard_table_html(table_df, show_tier):
    """Build the leaderboard ``<table>`` markup for the rows of *table_df*.

    Args:
        table_df: Rows to display. They must already be sorted so that all
            rows of a tier are adjacent (the callers sort by 'tier' first).
        show_tier: When true, a leading "Tier" column is added whose cell
            spans every row of that tier.

    Returns:
        The table as one HTML string. Lines carry no leading indentation and
        there are no blank lines, so Markdown processing cannot split the
        markup or mistake part of it for an indented code block.
    """
    headers = [
        'Rank',
        'Model',
        '🎯 Factual Precision',
        # NOTE(review): the original emoji for this column was corrupted and
        # could not be recovered unambiguously; 🌀 is a best guess - confirm.
        '🌀 Hallucination Score',
        'Avg. # Tokens',
        'Avg. # Units',
        'Avg. # Undecidable',
        'Avg. # Unsupported',
    ]
    if show_tier:
        headers.insert(0, 'Tier')

    lines = ['<table>', '<thead>', '<tr>']
    lines.extend(f'<th>{label}</th>' for label in headers)
    lines.extend(['</tr>', '</thead>', '<tbody>'])

    # Number of displayed rows per tier; used as the rowspan of the tier cell
    # so the table stays well-formed however many models a tier contains.
    rows_per_tier = table_df['tier'].value_counts().to_dict()

    current_tier = None
    for _, row in table_df.iterrows():
        lines.append('<tr>')

        # Emit the tier label once, on the first row of each tier.
        if show_tier and row['tier'] != current_tier:
            current_tier = row['tier']
            span = rows_per_tier.get(current_tier, 1)
            lines.append(
                f'<td rowspan="{span}" style="vertical-align: middle;">'
                f'{current_tier}</td>'
            )

        # Cell values are inserted without HTML escaping, as before.
        # NOTE(review): assumes the CSV is trusted project data - confirm.
        lines.extend([
            f"<td>{_format_cell(row['rank'])}</td>",
            f"<td>{_format_cell(row['model'])}</td>",
            f"<td>{_format_cell(row['factuality_score'])}</td>",
            f"<td>{_format_cell(row['hallucination_score'])}</td>",
            f"<td>{_format_cell(row['avg_tokens'])}</td>",
            f"<td>{_format_cell(row['avg_factual_units'])}</td>",
            f"<td>{_format_cell(row['avg_undecidable_units'], decimals=2)}</td>",
            f"<td>{_format_cell(row['avg_unsupported_units'], decimals=2)}</td>",
            '</tr>',
        ])

    lines.extend(['</tbody>', '</table>'])
    return '\n'.join(lines)


# Leaderboard tab: metric explanation, tier filter, optional sorting and the
# results table.
with tab1:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown("""
    <div class="metric" style="font-size:20px; font-weight: bold;">
        Metrics Explanation
    </div>
    """, unsafe_allow_html=True)

    # The emoji in this text had been corrupted and are restored here.
    # NOTE(review): the "Hallucination Score" emoji is a best guess - confirm.
    st.markdown("""
    <div class="metric" style="font-size:16px;">
        <br/>
        <p>
            <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
        </p>
        <p>
            🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
        </p>
    </div>
    """,
        unsafe_allow_html=True
    )

    # Larger fonts for the tier selector and the sort checkbox.
    st.markdown("""
    <style>
    /* Selectbox text */
    div[data-baseweb="select"] > div {
        font-size: 20px;
    }

    /* Dropdown options */
    div[role="listbox"] ul li {
        font-size: 20px !important;
    }

    /* Checkbox label */
    .stCheckbox label p {
        font-size: 20px !important;
    }

    /* Selectbox label */
    .stSelectbox label p {
        font-size: 20px !important;
    }
    </style>
    """, unsafe_allow_html=True)

    # Tier filter: either every tier or exactly the selected one.
    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
    selected_tier = st.selectbox('Select Tier:', tiers)
    show_all_tiers = selected_tier == 'All Tiers'

    if show_all_tiers:
        filtered_df = df
    else:
        filtered_df = df[df['tier'] == selected_tier]

    sort_by_factuality = st.checkbox('Sort by Factual Precision')

    # Rows are always grouped by tier first; inside a tier they are ordered
    # either by descending score or by their original position in the file.
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'factuality_score'], ascending=[True, False]
        )
    else:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'original_order']
        )

    # The "Tier" column is only shown when several tiers are listed together.
    html = _leaderboard_table_html(updated_filtered_df, show_tier=show_all_tiers)
    st.markdown(html, unsafe_allow_html=True)

    st.markdown('</div>', unsafe_allow_html=True)
|
|
|
|
|
# Benchmark Details tab: pipeline figure followed by three short sections
# describing VERIFY and the FactBench dataset.
with tab2:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    # NOTE(review): recent Streamlit releases deprecate `use_column_width` in
    # favour of `use_container_width`; left unchanged because the Streamlit
    # version this app targets is not visible here - confirm before switching.
    st.image(image, use_column_width=True)

    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
    st.write(
        "Language models (LMs) are widely used by an increasing number of users, "
        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
    )

    st.markdown('### Content Categorization')
    st.write(
        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
    )

    st.markdown('### Hallucination Prompts & FactBench Dataset')
    # The dash after "topics" had been corrupted into a Greek letter; it is
    # restored as an em dash.
    st.write(
        "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
        "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
        "regularly updated with new prompts."
    )

    st.markdown('</div>', unsafe_allow_html=True)
|
|
|
|
|
# Submission tab: a heading plus links for local testing and for filing
# results or issues on GitHub.
with tab3:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    submit_heading = '<div class="title">Submit your model information on our Github</div>'
    st.markdown(submit_heading, unsafe_allow_html=True)

    # Plain Markdown links, rendered one per element in this order.
    submission_links = (
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)',
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)',
    )
    for link_markdown in submission_links:
        st.markdown(link_markdown)

    st.markdown('</div>', unsafe_allow_html=True)
|
|