import os

import pandas as pd
import requests
import streamlit as st
from dotenv import load_dotenv

# Load environment variables from a local .env file.
load_dotenv()
SERVER_URL = os.getenv("SERVER_URL")
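# Hypothetical .env entry (the real endpoint is not part of this file):
#   SERVER_URL=https://example.com/api/leaderboard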

def get_data():
    """Fetch leaderboard results from the evaluation server."""
    response = requests.get(SERVER_URL, timeout=30)
    response.raise_for_status()  # Fail loudly on HTTP errors instead of parsing an error page.
    return response.json()


def get_score(result, benchmark):
    """Return a benchmark's acc_norm score, or None if the model was not evaluated on it."""
    try:
        return result[benchmark]["acc_norm"]
    except KeyError:
        return None
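
# The helpers above assume a payload of this shape, inferred from how the
# fields are read in main() below (a sketch, not a documented contract):
# [
#   {
#     "name": "<model id>",
#     "language": "kannada",
#     "result": {
#       "all": {"acc_norm": 0.52},
#       "ARC-Easy": {"acc_norm": 0.48},
#       ...
#     }
#   },
#   ...
# ]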

def main():
    st.set_page_config(page_title="Indic LLM Leaderboard", layout="wide")

    title_column, refresh_column = st.columns([0.92, 0.08])
    with title_column:
        st.title("Indic LLM Leaderboard (α)")
        st.markdown(
            "The Indic LLM Leaderboard uses the [indic_eval](https://github.com/adithya-s-k/indic_eval) "
            "evaluation framework, incorporating SOTA translated benchmarks such as ARC, Hellaswag, and MMLU. "
            "Supporting 7 Indic languages, it offers a comprehensive platform for assessing model performance "
            "and comparing results across the Indic language modeling landscape."
        )
    with refresh_column:
        # A button press triggers a Streamlit rerun, which refetches the data below.
        st.button("Refresh", type="primary")

    Leaderboard_tab, About_tab, FAQ_tab, Submit_tab = st.tabs(
        ["🏅 Leaderboard", "📝 About", "❗ FAQ", "🚀 Submit"]
    )

    with Leaderboard_tab:
        data = get_data()
        # Flatten the server payload into one row per (model, language) entry.
        table_data = []
        for item in data:
            result = item.get("result", {})
            table_data.append({
                "Model Name": item.get("name"),
                "Language": item.get("language"),
                "Average (All)": get_score(result, "all"),
                "ARC-Easy": get_score(result, "ARC-Easy"),
                "ARC-Challenge": get_score(result, "ARC-Challenge"),
                "Hellaswag": get_score(result, "Hellaswag"),
                "Boolq": get_score(result, "Boolq"),
                "MMLU": get_score(result, "MMLU"),
                # The server result key is spelled "Winograde".
                "Winogrande": get_score(result, "Winograde"),
                "Translation": get_score(result, "Translation"),
                "Generation": get_score(result, "Generation"),
            })
        df = pd.DataFrame(table_data)
        title = st.text_input(
            "Model Name",
            placeholder="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
        )

        col1, col2 = st.columns(2)
        with col1:
            benchmark_options = st.multiselect(
                "Pick Benchmarks",
                ["ARC-Easy", "ARC-Challenge", "Hellaswag", "Boolq", "MMLU", "Winogrande", "Translation", "Generation"],
                ["ARC-Easy", "ARC-Challenge", "Hellaswag", "Boolq", "MMLU"],
            )
        with col2:
            # These labels must match the `language` values the server returns.
            language_options = st.multiselect(
                "Pick Languages",
                ["kannada", "hindi", "tamil", "telegu", "gujarathi", "marathi", "malayalam"],
                ["kannada", "hindi", "tamil", "telegu", "gujarathi", "marathi", "malayalam"],
            )
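        # Hypothetical example queries: "Gemma-7B;OpenHathi-7B" matches those
        # two model names exactly, while "gemma" substring-matches every model
        # whose name contains it, case-insensitively.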
        if title:
            # Exact-match a `;`-separated list; otherwise substring-match a single query.
            if ";" in title:
                model_names = [name.strip() for name in title.split(";")]
                filtered_df = df[df["Model Name"].isin(model_names)]
            else:
                # regex=False treats the query as a literal string, so names
                # containing characters like "(" do not raise a regex error.
                filtered_df = df[df["Model Name"].str.contains(title, case=False, na=False, regex=False)]
        else:
            filtered_df = df

        filtered_df = filtered_df[filtered_df["Language"].isin(language_options)]
        filtered_df = filtered_df[filtered_df.columns.intersection(["Model Name", "Language"] + benchmark_options)].copy()
        # Average across the selected benchmark columns only.
        filtered_df["Average"] = filtered_df[benchmark_options].mean(axis=1)
        st.dataframe(filtered_df, use_container_width=True)
        # Side-by-side comparison of selected models.
        compare_models = st.multiselect(
            "Pick models to compare",
            df["Model Name"].unique(),
        )
        if compare_models:
            compare_df = df[df["Model Name"].isin(compare_models)].copy()
            compare_df["Average"] = compare_df[benchmark_options].mean(axis=1)
            st.dataframe(compare_df, use_container_width=True)
    # About tab
    with About_tab:
        st.markdown('''
        ### About Indic LLM Leaderboard
        ### Indic Eval
        ### Contribute
        ''')

    # FAQ tab
    with FAQ_tab:
        st.markdown('''
        ### FAQ
        ### SUBMISSIONS
        ### RESULTS
        ### EDITING SUBMISSIONS
        ### OTHER
        ''')

    # Submit tab
    with Submit_tab:
        st.markdown('''
        ### Submit Your Model
        ''')
    with st.expander(label="📙 Citation"):
        code = r'''
        @misc{indic-llm-leaderboard,
            author = {Adithya S Kolavi},
            title = {Indic LLM Leaderboard},
            year = {2024},
            publisher = {Cognitivelab},
            howpublished = {\url{https://huggingface.co./spaces/Cognitive-Lab/indic_llm_leaderboard}},
        }
        '''
        st.code(code, language='latex')

if __name__ == "__main__":
    main()
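
# To run locally (assuming this file is saved as app.py and a .env file sets SERVER_URL):
#   streamlit run app.py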