import os

import pandas as pd
import plotly.graph_objs as go
import requests
import streamlit as st
from dotenv import load_dotenv

# Load environment variables from a local .env file, if present.
load_dotenv()

SERVER_URL = os.getenv("SERVER_URL")
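# Example .env entry for the variable above (hypothetical URL, shown only for
# illustration; the real endpoint is whatever the leaderboard server exposes):
#   SERVER_URL=https://example.com/indic-leaderboard/results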
|
|
def get_data():
    """Fetch evaluation results from the leaderboard server as parsed JSON."""
    response = requests.get(SERVER_URL, timeout=30)
    response.raise_for_status()
    return response.json()
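# Expected response shape, inferred from the field accesses in main() below.
# The full schema is an assumption; only "name", "language", and the
# per-benchmark "acc_norm" values are actually read:
#
# [
#     {
#         "name": "<model id>",
#         "language": "kannada",
#         "result": {
#             "all": {"acc_norm": 0.52},
#             "ARC-Easy": {"acc_norm": 0.48},
#             ...
#         }
#     },
#     ...
# ]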
|
|
def main():
    st.set_page_config(page_title="Indic LLM Leaderboard", layout="wide")

    title_column, refresh_column = st.columns([0.92, 0.08])
    with title_column:
        st.title("Indic LLM Leaderboard (α)")
        st.markdown(
            "The Indic LLM Leaderboard uses the [indic_eval](https://github.com/adithya-s-k/indic_eval) "
            "evaluation framework, incorporating SOTA translated benchmarks such as ARC, Hellaswag, and "
            "MMLU, among others. Supporting 7 Indic languages, it offers a comprehensive platform for "
            "assessing model performance and comparing results within the Indic language modeling landscape."
        )
    with refresh_column:
        if st.button("Refresh", type="primary"):
            # A button click re-runs the script, so the Leaderboard tab below
            # re-fetches the latest data.
            st.rerun()
|
    Leaderboard_tab, About_tab, FAQ_tab, Submit_tab = st.tabs(
        ["🏅 Leaderboard", "📝 About", "❓ FAQ", "🚀 Submit"]
    )
|
    with Leaderboard_tab:
        data = get_data()

        def metric(item, benchmark):
            """Return the acc_norm score for a benchmark, or None if missing."""
            try:
                return item["result"][benchmark]["acc_norm"]
            except KeyError:
                return None

        table_data = []
        all_models = []

        for item in data:
            model_name = item.get("name")
            language = item.get("language")

            all_models.append(model_name)
            table_data.append({
                "Model Name": model_name,
                "Language": language,
                "Average": metric(item, "all"),
                "ARC-Easy": metric(item, "ARC-Easy"),
                "ARC-Challenge": metric(item, "ARC-Challenge"),
                "Hellaswag": metric(item, "Hellaswag"),
                "Boolq": metric(item, "Boolq"),
                "MMLU": metric(item, "MMLU"),
                # The server reports this benchmark under the key "Winograde";
                # the column is named "Winogrande" to match the benchmark picker.
                "Winogrande": metric(item, "Winograde"),
                "Translation": metric(item, "Translation"),
                "Generation": metric(item, "Generation"),
            })

        df = pd.DataFrame(table_data)
|
        title = st.text_input(
            'Model Name',
            placeholder="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
        )

        col1, col2 = st.columns(2)
        with col1:
            benchmark_options = st.multiselect(
                'Pick Benchmark',
                ['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq', 'MMLU', 'Winogrande', 'Translation', 'Generation'],
                ['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq', 'MMLU'],
            )
        with col2:
            # Language labels are kept exactly as the server spells them in the
            # "language" field, so the filter below matches the data.
            languages = ['kannada', 'hindi', 'tamil', 'telegu', 'gujarathi', 'marathi', 'malayalam']
            language_options = st.multiselect('Pick Languages', languages, languages)
|
        if title:
            if ';' in title:
                model_names = [name.strip() for name in title.split(';')]
                filtered_df = df[df['Model Name'].isin(model_names)]
            else:
                filtered_df = df[df['Model Name'].str.contains(title, case=False, na=False)]

            filtered_df = filtered_df[filtered_df['Language'].isin(language_options)]
            filtered_df = filtered_df[df.columns.intersection(['Model Name', 'Language'] + benchmark_options)].copy()
            # Recompute the average over just the selected benchmarks.
            filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
            st.dataframe(filtered_df, use_container_width=True)
        elif benchmark_options or language_options:
            filtered_df = df[df['Language'].isin(language_options)]
            filtered_df = filtered_df[df.columns.intersection(['Model Name', 'Language'] + benchmark_options)].copy()
            filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
            st.dataframe(filtered_df, use_container_width=True)
|
        compare_models = st.multiselect(
            'Pick models to compare',
            df['Model Name'].unique(),
        )

        if compare_models:
            compare_df = df[df['Model Name'].isin(compare_models)].copy()
            # Average over the currently selected benchmarks.
            compare_df['Average'] = compare_df[benchmark_options].mean(axis=1)
            st.dataframe(compare_df, use_container_width=True)
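
            # Illustrative sketch (an addition, not part of the original app):
            # render the same comparison as a grouped bar chart using
            # plotly.graph_objs, already imported as `go`. Column names follow
            # the DataFrame built above.
            fig = go.Figure()
            for _, row in compare_df.iterrows():
                fig.add_trace(go.Bar(
                    name=f"{row['Model Name']} ({row['Language']})",
                    x=benchmark_options,
                    y=[row[b] for b in benchmark_options],
                ))
            fig.update_layout(barmode='group', yaxis_title='acc_norm')
            st.plotly_chart(fig, use_container_width=True)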
|
    with About_tab:
        st.markdown('''
### About Indic LLM Leaderboard

### Indic Eval

### Contribute
''')
|
    with FAQ_tab:
        st.markdown('''
### FAQ

### SUBMISSIONS

### RESULTS

### EDITING SUBMISSIONS

### OTHER
''')
|
    with Submit_tab:
        st.markdown('''
### Submit Your Model
''')
|
    with st.expander(label="📙 Citation"):
        code = r'''
@misc{indic-llm-leaderboard,
    author = {Adithya S Kolavi},
    title = {Indic LLM Leaderboard},
    year = {2024},
    publisher = {Cognitivelab},
    howpublished = {\url{https://huggingface.co/spaces/Cognitive-Lab/indic_llm_leaderboard}},
}
'''
        st.code(code, language='latex')
|
|
if __name__ == "__main__":
    main()
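
# Local usage (assuming this file is saved as app.py; the name is illustrative):
#   streamlit run app.py
# Requires streamlit, pandas, plotly, requests, and python-dotenv, plus a
# SERVER_URL environment variable pointing at the leaderboard results API.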