sheza munir committed on
Set up leaderboard page

- app.py +200 -0
- factEvalSteps.png +0 -0
- factbench_data.csv +13 -0
- requirements.py +3 -0
app.py
ADDED
@@ -0,0 +1,200 @@
+import streamlit as st
+import pandas as pd
+from PIL import Image
+
+# Set up page config
+st.set_page_config(
+    page_title="FactBench Leaderboard",
+    # layout="wide",  # left commented out so the default centered layout is used
+)
+
+# Load the pipeline-overview image
+image = Image.open("factEvalSteps.png")
+
+# Custom CSS for the page
+st.markdown(
+    """
+    <style>
+    @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
+
+    html, body, [class*="css"] {
+        font-family: 'Courier Prime', monospace;
+    }
+
+    .title {
+        font-size: 42px;
+        font-weight: bold;
+        text-align: center;
+        color: #333;
+        margin-bottom: 5px;
+    }
+
+    .description {
+        font-size: 22px;
+        text-align: center;
+        margin-bottom: 30px;
+        color: #555;
+    }
+
+    .container {
+        max-width: 1000px; /* Set a max-width for the container */
+        margin: 0 auto;    /* Center the container */
+        padding: 20px;
+    }
+
+    table {
+        width: 100%;
+        border-collapse: collapse;
+        border-radius: 10px;
+        overflow: hidden;
+    }
+
+    th, td {
+        padding: 8px;
+        text-align: center;
+        border: 1px solid #ddd;
+        font-size: 14px;
+        transition: background-color 0.3s;
+    }
+
+    th {
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+
+    td:hover {
+        background-color: #eaeaea;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True
+)
+
+# Display title and description
+st.markdown('<div class="container">', unsafe_allow_html=True)
+st.markdown('<div class="title">FactBench</div>',
+            unsafe_allow_html=True)
+st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</div>',
+            unsafe_allow_html=True)
+st.markdown('</div>', unsafe_allow_html=True)
+
+# Load the data
+data_path = "factbench_data.csv"
+df = pd.read_csv(data_path)
+
+# Create tabs
+tab1, tab2, tab3 = st.tabs(
+    ["Leaderboard", "Benchmark Details", "Submit your models"])
+
+# Tab 1: Leaderboard
+with tab1:
+    st.markdown('<div class="title">Leaderboard</div>',
+                unsafe_allow_html=True)
+    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+    # Dropdown menu to filter tiers
+    tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
+    selected_tier = st.selectbox('Select Tier:', tiers)
+
+    # Filter the data based on the selected tier
+    if selected_tier != 'All Tiers':
+        filtered_df = df[df['Tier'] == selected_tier]
+    else:
+        filtered_df = df
+
+    # Create HTML for the table
+    html = '''
+    <table>
+        <thead>
+            <tr>
+                <th>Tier</th>
+                <th>Model</th>
+                <th>FactScore</th>
+                <th>SAFE</th>
+                <th>Factcheck-GPT</th>
+                <th>VERIFY</th>
+            </tr>
+        </thead>
+        <tbody>
+    '''
+
+    # Generate the rows of the table
+    current_tier = None
+    for _, row in filtered_df.iterrows():
+        if row['Tier'] != current_tier:
+            # First row of a new tier: the tier cell spans its four model rows
+            current_tier = row['Tier']
+            html += f'<tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
+        else:
+            html += '<tr>'
+
+        # Fill in model and scores
+        html += f'''
+            <td>{row['Model']}</td>
+            <td>{row['FactScore']:.2f}</td>
+            <td>{row['SAFE']:.2f}</td>
+            <td>{row['Factcheck-GPT']:.2f}</td>
+            <td>{row['VERIFY']:.2f}</td>
+        </tr>
+        '''
+
+    # Close the table tags
+    html += '''
+        </tbody>
+    </table>
+    '''
+
+    # Display the table
+    st.markdown(html, unsafe_allow_html=True)
+
+    st.markdown('</div>', unsafe_allow_html=True)
+
+# Tab 2: Details
+with tab2:
+    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+    st.markdown('<div class="title">Benchmark Details</div>',
+                unsafe_allow_html=True)
+    st.image(image, use_column_width=True)
+
+    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
+    st.write(
+        "Language models (LMs) are widely used by an increasing number of users, "
+        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
+        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
+        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
+    )
+
+    st.markdown('### Content Categorization')
+    st.write(
+        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
+        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
+        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
+    )
+
+    st.markdown('### Hallucination Prompts & FactBench Dataset')
+    st.write(
+        "Using VERIFY, we identify 'hallucination prompts' across diverse topics: those eliciting the highest rates of "
+        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
+        "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
+        "regularly updated with new prompts."
+    )
+
+    st.markdown('</div>', unsafe_allow_html=True)
+
+# Tab 3: Links
+with tab3:
+    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+    st.markdown('<div class="title">Submit your model information on our GitHub</div>',
+                unsafe_allow_html=True)
+
+    st.markdown(
+        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
+    st.markdown(
+        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
+
+    st.markdown('</div>', unsafe_allow_html=True)
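The table builder above hardcodes rowspan="4", which happens to match the four models per tier in factbench_data.csv. A minimal sketch of a more defensive variant, deriving each span from the data with a pandas groupby, is shown below; it is not part of the commit, just an illustration of the same row-generation technique:

# Sketch (not in the commit): build the same table body, but derive each
# tier's rowspan from the data instead of hardcoding 4.
import pandas as pd

df = pd.read_csv("factbench_data.csv")
score_cols = ["FactScore", "SAFE", "Factcheck-GPT", "VERIFY"]

rows = []
for tier, group in df.groupby("Tier", sort=False):  # sort=False keeps CSV order
    for i, (_, row) in enumerate(group.iterrows()):
        # Only the first row of a tier carries the spanning tier cell.
        tier_cell = (f'<td rowspan="{len(group)}" '
                     f'style="vertical-align: middle;">{tier}</td>') if i == 0 else ''
        cells = ''.join(f'<td>{row[c]:.2f}</td>' for c in score_cols)
        rows.append(f'<tr>{tier_cell}<td>{row["Model"]}</td>{cells}</tr>')

html = '<table><tbody>' + ''.join(rows) + '</tbody></table>'

With this shape, adding or removing a model from a tier in the CSV would not break the table layout.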
factEvalSteps.png
ADDED
factbench_data.csv
ADDED
@@ -0,0 +1,13 @@
+Tier,Model,FactScore,SAFE,Factcheck-GPT,VERIFY
+Tier 1: Easy,GPT4-o,53.19,63.31,86.4,71.58
+Tier 1: Easy,Gemini1.5-Pro,51.79,61.24,83.45,69.38
+Tier 1: Easy,Llama3.1-70B-Instruct,52.49,61.29,83.48,67.27
+Tier 1: Easy,Llama3.1-405B-Instruct,53.22,61.63,83.57,64.94
+Tier 2: Moderate,GPT4-o,54.76,65.01,89.39,76.02
+Tier 2: Moderate,Gemini1.5-Pro,52.62,62.68,87.44,74.24
+Tier 2: Moderate,Llama3.1-70B-Instruct,52.53,62.64,85.16,72.01
+Tier 2: Moderate,Llama3.1-405B-Instruct,53.48,63.29,86.37,70.25
+Tier 3: Hard,GPT4-o,69.44,76.17,94.25,90.58
+Tier 3: Hard,Gemini1.5-Pro,66.05,75.69,91.09,87.82
+Tier 3: Hard,Llama3.1-70B-Instruct,69.85,77.55,92.89,86.63
+Tier 3: Hard,Llama3.1-405B-Instruct,70.04,77.01,93.64,85.79
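For a quick look at this data outside the app, a short pandas sketch (again, not part of the commit) that averages each evaluation method's scores per tier:

import pandas as pd

df = pd.read_csv("factbench_data.csv")
# Mean score per tier for each evaluation method, kept in CSV order.
print(df.groupby("Tier", sort=False)[["FactScore", "SAFE", "Factcheck-GPT", "VERIFY"]]
        .mean().round(2))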
requirements.py
ADDED
@@ -0,0 +1,3 @@
+pandas
+streamlit
+scikit-learn == 1.0.2