import os

import pandas as pd
import requests
import streamlit as st
from dotenv import load_dotenv

load_dotenv()

SERVER_URL = os.getenv("SERVER_URL")


def get_data():
    """Fetch the leaderboard results from the backend server."""
    response = requests.get(SERVER_URL)
    data = response.json()
    return data


def main():
    st.set_page_config(page_title="Indic LLM Leaderboard", layout="wide")

    title_column, refresh_column = st.columns([0.92, 0.08])
    with title_column:
        st.title("Indic LLM Leaderboard (α)")
        st.markdown(
            "The Indic LLM Leaderboard uses the [indic_eval](https://github.com/adithya-s-k/indic_eval) "
            "evaluation framework, incorporating SOTA translated benchmarks such as ARC, Hellaswag, and MMLU. "
            "Supporting 7 Indic languages, it offers a comprehensive platform for assessing model performance "
            "and comparing results within the Indic language modeling landscape."
        )
    with refresh_column:
        # Clicking the button triggers a Streamlit rerun, which refetches the data below.
        st.button("Refresh", type="primary")

    Leaderboard_tab, About_tab, FAQ_tab, Submit_tab = st.tabs(
        ["🏅 Leaderboard", "📝 About", "❗FAQ", "🚀 Submit"]
    )

    with Leaderboard_tab:
        data = get_data()

        def get_score(results, benchmark):
            """Return the acc_norm score for a benchmark, or None if it is missing."""
            try:
                return results[benchmark]["acc_norm"]
            except KeyError:
                return None

        table_data = []
        all_models = []

        for item in data:
            model_name = item.get("name")
            language = item.get("language")
            results = item.get("result", {})

            all_models.append(model_name)
            table_data.append({
                "Model Name": model_name,
                "Language": language,
                "Average": get_score(results, "all"),
                "ARC-Easy": get_score(results, "ARC-Easy"),
                "ARC-Challenge": get_score(results, "ARC-Challenge"),
                "Hellaswag": get_score(results, "Hellaswag"),
                "Boolq": get_score(results, "Boolq"),
                "MMLU": get_score(results, "MMLU"),
                "Winogrande": get_score(results, "Winograde"),
                "Translation": get_score(results, "Translation"),
                "Generation": get_score(results, "Generation"),
            })

        df = pd.DataFrame(table_data)

        title = st.text_input(
            'Model Name',
            placeholder="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
        )

        col1, col2 = st.columns(2)
        with col1:
            benchmark_options = st.multiselect(
                'Pick Benchmarks',
                ['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq', 'MMLU', 'Winogrande', 'Translation', 'Generation'],
                ['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq', 'MMLU'],
            )
        with col2:
            # Language identifiers must match the `language` values returned by the server.
            language_options = st.multiselect(
                'Pick Languages',
                ['kannada', 'hindi', 'tamil', 'telegu', 'gujarathi', 'marathi', 'malayalam'],
                ['kannada', 'hindi', 'tamil', 'telegu', 'gujarathi', 'marathi', 'malayalam'],
            )

        if title:
            if ';' in title:
                model_names = [name.strip() for name in title.split(';')]
                filtered_df = df[df['Model Name'].isin(model_names)]
            else:
                filtered_df = df[df['Model Name'].str.contains(title, case=False, na=False)]
            filtered_df = filtered_df[filtered_df['Language'].isin(language_options)]
            filtered_df = filtered_df[df.columns.intersection(['Model Name', 'Language'] + benchmark_options)].copy()
            # Calculate the average across the selected benchmark columns
            filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
            # Display the filtered DataFrame
            st.dataframe(filtered_df, use_container_width=True)
        elif benchmark_options or language_options:
            filtered_df = df[df['Language'].isin(language_options)]
            filtered_df = filtered_df[df.columns.intersection(['Model Name', 'Language'] + benchmark_options)].copy()
            # Calculate the average across the selected benchmark columns
            filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
            st.dataframe(filtered_df, use_container_width=True)

        # Multiselect for comparing models
        compare_models = st.multiselect(
            'Pick models to compare',
            df['Model Name'].unique(),
        )

        # Display a DataFrame with the selected models and their scores
        if compare_models:
            compare_data = []
            for model in compare_models:
                model_data = df[df['Model Name'] == model]
                compare_data.append(model_data)
            if compare_data:
                compare_df = pd.concat(compare_data)
                # Calculate the average across the selected benchmark columns
                compare_df['Average'] = compare_df[benchmark_options].mean(axis=1)
                st.dataframe(compare_df, use_container_width=True)

    # About tab
    with About_tab:
        st.markdown('''
        ### About Indic LLM Leaderboard

        ### Indic Eval

        ### Contribute
        ''')

    # FAQ tab
    with FAQ_tab:
        st.markdown('''
        ### FAQ

        ### SUBMISSIONS

        ### RESULTS

        ### EDITING SUBMISSIONS

        ### OTHER
        ''')

    # Submit tab
    with Submit_tab:
        st.markdown('''
        ### Submit Your Model
        ''')

    with st.expander(label="📙 Citation"):
        code = r'''
        @misc{indic-llm-leaderboard,
            author = {Adithya S Kolavi},
            title = {Indic LLM Leaderboard},
            year = {2024},
            publisher = {Cognitivelab},
            howpublished = {\url{https://huggingface.co./spaces/Cognitive-Lab/indic_llm_leaderboard}},
        }
        '''
        st.code(code, language='latex')


if __name__ == "__main__":
    main()