import os

import pandas as pd
import plotly.graph_objs as go
import requests
import streamlit as st
from dotenv import load_dotenv

# Load environment variables from a local .env file, if present.
load_dotenv()

SERVER_URL = os.getenv("SERVER_URL")
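# Example .env entry for the variable above (hypothetical URL, shown only for
# illustration; the real endpoint is whatever the leaderboard server exposes):
#   SERVER_URL=https://example.com/indic-leaderboard/results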
|
|
def get_data():
    """Fetch evaluation results from the leaderboard server as parsed JSON."""
    response = requests.get(SERVER_URL, timeout=30)
    response.raise_for_status()
    return response.json()
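# Expected response shape, inferred from the field accesses in main() below.
# The full schema is an assumption; only "name", "language", and the
# per-benchmark "acc_norm" values are actually read:
#
# [
#     {
#         "name": "<model id>",
#         "language": "kannada",
#         "result": {
#             "all": {"acc_norm": 0.52},
#             "ARC-Easy": {"acc_norm": 0.48},
#             ...
#         }
#     },
#     ...
# ]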
|
|
def main():
    st.set_page_config(page_title="Indic LLM Leaderboard", layout="wide")

    title_column, refresh_column = st.columns([0.92, 0.08])
    with title_column:
        st.title("Indic LLM Leaderboard (α)")
        st.markdown(
            "The Indic LLM Leaderboard uses the [indic_eval](https://github.com/adithya-s-k/indic_eval) "
            "evaluation framework, incorporating SOTA translated benchmarks such as ARC, Hellaswag, and "
            "MMLU, among others. Supporting 7 Indic languages, it offers a comprehensive platform for "
            "assessing model performance and comparing results within the Indic language modeling landscape."
        )
    with refresh_column:
        if st.button("Refresh", type="primary"):
            # A button click re-runs the script, so the Leaderboard tab below
            # re-fetches the latest data.
            st.rerun()
|
    Leaderboard_tab, About_tab, FAQ_tab, Submit_tab = st.tabs(
        ["🏅 Leaderboard", "📝 About", "❓ FAQ", "🚀 Submit"]
    )
|
    with Leaderboard_tab:
        data = get_data()

        def metric(item, benchmark):
            """Return the acc_norm score for a benchmark, or None if missing."""
            try:
                return item["result"][benchmark]["acc_norm"]
            except KeyError:
                return None

        table_data = []
        all_models = []

        for item in data:
            model_name = item.get("name")
            language = item.get("language")

            all_models.append(model_name)
            table_data.append({
                "Model Name": model_name,
                "Language": language,
                "Average": metric(item, "all"),
                "ARC-Easy": metric(item, "ARC-Easy"),
                "ARC-Challenge": metric(item, "ARC-Challenge"),
                "Hellaswag": metric(item, "Hellaswag"),
                "Boolq": metric(item, "Boolq"),
                "MMLU": metric(item, "MMLU"),
                # The server reports this benchmark under the key "Winograde";
                # the column is named "Winogrande" to match the benchmark picker.
                "Winogrande": metric(item, "Winograde"),
                "Translation": metric(item, "Translation"),
                "Generation": metric(item, "Generation"),
            })

        df = pd.DataFrame(table_data)
|
        title = st.text_input(
            'Model Name',
            placeholder="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
        )

        col1, col2 = st.columns(2)
        with col1:
            benchmark_options = st.multiselect(
                'Pick Benchmark',
                ['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq', 'MMLU', 'Winogrande', 'Translation', 'Generation'],
                ['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq', 'MMLU'],
            )
        with col2:
            # Language labels are kept exactly as the server spells them in the
            # "language" field, so the filter below matches the data.
            languages = ['kannada', 'hindi', 'tamil', 'telegu', 'gujarathi', 'marathi', 'malayalam']
            language_options = st.multiselect('Pick Languages', languages, languages)
|
        if title:
            if ';' in title:
                model_names = [name.strip() for name in title.split(';')]
                filtered_df = df[df['Model Name'].isin(model_names)]
            else:
                filtered_df = df[df['Model Name'].str.contains(title, case=False, na=False)]

            filtered_df = filtered_df[filtered_df['Language'].isin(language_options)]
            filtered_df = filtered_df[df.columns.intersection(['Model Name', 'Language'] + benchmark_options)].copy()
            # Recompute the average over just the selected benchmarks.
            filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
            st.dataframe(filtered_df, use_container_width=True)
        elif benchmark_options or language_options:
            filtered_df = df[df['Language'].isin(language_options)]
            filtered_df = filtered_df[df.columns.intersection(['Model Name', 'Language'] + benchmark_options)].copy()
            filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
            st.dataframe(filtered_df, use_container_width=True)
|
        compare_models = st.multiselect(
            'Pick models to compare',
            df['Model Name'].unique(),
        )

        if compare_models:
            compare_df = df[df['Model Name'].isin(compare_models)].copy()
            # Average over the currently selected benchmarks.
            compare_df['Average'] = compare_df[benchmark_options].mean(axis=1)
            st.dataframe(compare_df, use_container_width=True)
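
            # Illustrative sketch (an addition, not part of the original app):
            # render the same comparison as a grouped bar chart using
            # plotly.graph_objs, already imported as `go`. Column names follow
            # the DataFrame built above.
            fig = go.Figure()
            for _, row in compare_df.iterrows():
                fig.add_trace(go.Bar(
                    name=f"{row['Model Name']} ({row['Language']})",
                    x=benchmark_options,
                    y=[row[b] for b in benchmark_options],
                ))
            fig.update_layout(barmode='group', yaxis_title='acc_norm')
            st.plotly_chart(fig, use_container_width=True)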
|
    with About_tab:
        st.markdown('''
### About Indic LLM Leaderboard

### Indic Eval

### Contribute
''')
|
    with FAQ_tab:
        st.markdown('''
### FAQ

### SUBMISSIONS

### RESULTS

### EDITING SUBMISSIONS

### OTHER
''')
|
    with Submit_tab:
        st.markdown('''
### Submit Your Model
''')
|
    with st.expander(label="📙 Citation"):
        code = r'''
@misc{indic-llm-leaderboard,
    author = {Adithya S Kolavi},
    title = {Indic LLM Leaderboard},
    year = {2024},
    publisher = {Cognitivelab},
    howpublished = {\url{https://huggingface.co/spaces/Cognitive-Lab/indic_llm_leaderboard}},
}
'''
        st.code(code, language='latex')
|
|
if __name__ == "__main__":
    main()
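
# Local usage (assuming this file is saved as app.py; the name is illustrative):
#   streamlit run app.py
# Requires streamlit, pandas, plotly, requests, and python-dotenv, plus a
# SERVER_URL environment variable pointing at the leaderboard results API.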