Spaces:

open-llm-leaderboard
/

comparator

Running

App Files Files Community

comparator / src /results.py

albertvillanova HF staff

Extract concat_results function

ea4c670 verified 17 days ago

raw

history blame

4.67 kB

	import asyncio

	import gradio as gr
	import pandas as pd

	import src.constants as constants
	from src.hub import glob, load_json_file


	def fetch_result_paths():
	    """Return the paths of all JSON result files in the results dataset.

	    The pattern uses recursive wildcards so that files nested in
	    sub-directories of ``constants.RESULTS_DATASET_ID`` are matched; a
	    pattern with empty path segments ("///") does not descend into them.
	    ``sort_result_paths_per_model`` treats everything between the dataset id
	    and the file name as the model id.

	    Returns:
	        Whatever ``glob`` returns for the pattern (presumably a list of path
	        strings that start with the dataset id; confirm against ``src.hub``).
	    """
	    path = f"{constants.RESULTS_DATASET_ID}/**/**/*.json"
	    return glob(path)


	def sort_result_paths_per_model(paths):
	    """Group result file paths by model id and sort each group.

	    The model id of a path is the part between the leading
	    ``"<RESULTS_DATASET_ID>/"`` prefix and the last ``/`` (i.e. the path
	    without the dataset id and without the file name).

	    Args:
	        paths: Iterable of result file paths.

	    Returns:
	        Dict mapping each model id (in order of first appearance) to the
	        ascending-sorted list of its paths.
	    """
	    grouped = {}
	    for result_path in paths:
	        # Drop "<dataset id>/" from the front, then split off the file name.
	        relative_path = result_path[len(constants.RESULTS_DATASET_ID) + 1 :]
	        model_id, _file_name = relative_path.rsplit("/", 1)
	        grouped.setdefault(model_id, []).append(result_path)
	    return {model_id: sorted(model_paths) for model_id, model_paths in grouped.items()}


	def update_load_results_component():
	    """Return an enabled "Load" button in both slots of a 2-tuple.

	    The same ``gr.Button`` instance is used for both entries (presumably the
	    results and configs load buttons; confirm against the app wiring).
	    """
	    load_button = gr.Button("Load", interactive=True)
	    return load_button, load_button


	async def load_results_dataframe(model_id, result_paths_per_model=None):
	    """Load all result files of one model into a single-row DataFrame.

	    Args:
	        model_id: Key into ``result_paths_per_model``.
	        result_paths_per_model: Mapping of model id -> list of result file
	            paths.

	    Returns:
	        A one-row DataFrame whose columns are the flattened
	        ``results.*`` / ``configs.*`` keys plus an ``index`` column holding
	        the model name, or ``None`` when there is nothing to load (no model
	        selected, no mapping, unknown model id, or no result files).
	    """
	    if not model_id or not result_paths_per_model:
	        return None
	    result_paths = result_paths_per_model.get(model_id)
	    if not result_paths:
	        # Unknown model or empty file list: report "nothing to load" the same
	        # way as the guard above instead of failing further down.
	        return None
	    results = await asyncio.gather(*(load_json_file(path) for path in result_paths))
	    data = {"results": {}, "configs": {}}
	    for result in results:
	        # Files later in the list override earlier ones for the same key.
	        data["results"].update(result["results"])
	        data["configs"].update(result["configs"])
	    # The model name is taken from the last loaded file.
	    model_name = results[-1].get("model_name", "Model")
	    # json_normalize flattens nested dicts into dot-separated column names.
	    df = pd.json_normalize([data])
	    # reset_index moves the model name into a column called "index".
	    return df.set_index(pd.Index([model_name])).reset_index()


	async def load_results_dataframes(*model_ids, result_paths_per_model=None):
	    """Concurrently load the results DataFrame of every given model id.

	    Args:
	        *model_ids: Model ids to load, one DataFrame each.
	        result_paths_per_model: Mapping forwarded unchanged to
	            ``load_results_dataframe``.

	    Returns:
	        List with one ``load_results_dataframe`` result per model id, in the
	        same order as ``model_ids``.
	    """
	    pending = [load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids]
	    return await asyncio.gather(*pending)


	def concat_results(dfs):
	    """Stack per-model DataFrames into one frame indexed by model name.

	    Entries that are ``None`` (``load_results_dataframe`` returns ``None``
	    when there is nothing to load) or that lack an ``index`` column are
	    skipped.

	    Args:
	        dfs: Iterable of DataFrames (or ``None`` placeholders).

	    Returns:
	        The concatenation of the usable frames, each re-indexed by its
	        ``index`` column, or ``None`` when no frame is usable.
	    """
	    indexed = [df.set_index("index") for df in dfs if df is not None and "index" in df.columns]
	    if not indexed:
	        return None
	    return pd.concat(indexed)


	def display_results(task, hide_std_errors, show_only_differences, *dfs):
	    """Build the HTML for the results tab and the configs tab.

	    Args:
	        task: Task filter forwarded to ``display_tab``.
	        hide_std_errors: Applied to the results tab only.
	        show_only_differences: Applied to the configs tab only.
	        *dfs: Per-model DataFrames to compare.

	    Returns:
	        ``(results_html, configs_html)``, or ``(None, None)`` when
	        ``concat_results`` yields nothing to display.
	    """
	    combined = concat_results(dfs)
	    if combined is None:
	        return None, None
	    # Transpose so that models become columns and flattened keys become rows.
	    table = combined.T.rename_axis(columns=None)
	    results_html = display_tab("results", table, task, hide_std_errors=hide_std_errors)
	    configs_html = display_tab("configs", table, task, show_only_differences=show_only_differences)
	    return results_html, configs_html


	def display_tab(tab, df, task, hide_std_errors=True, show_only_differences=False):
	    """Render one tab of the comparison table as an HTML string.

	    Args:
	        tab: Row-label prefix selecting the rows of this tab; callers in this
	            file pass "results" or "configs".
	        df: DataFrame with one column per model and one row per flattened
	            key; row labels are strings starting with "<tab>.".
	        task: Task name used as a row-label filter, or "All" for no task
	            filter.
	        hide_std_errors: Hide rows whose label ends with "_stderr,none".
	        show_only_differences: Hide rows in which every column matches the
	            first column. Cells that are missing on both sides count as
	            matching.

	    Returns:
	        The styled table rendered with ``Styler.to_html()``.
	    """
	    any_difference = None
	    if show_only_differences:
	        first_column = df.iloc[:, 0]
	        differs = df.ne(first_column, axis=0).to_numpy()
	        # Null cells compare as "not equal" even to another null, so without
	        # this mask a row that is empty for every model would be reported as
	        # a difference.
	        both_missing = df.isna().to_numpy() & first_column.isna().to_numpy()[:, None]
	        any_difference = pd.Series((differs & ~both_missing).any(axis=1), index=df.index)

	    def is_hidden(row):
	        return (
	            not row.startswith(f"{tab}.")
	            or row.startswith(f"{tab}.leaderboard.")
	            or row.endswith(".alias")
	            or (
	                not row.startswith(f"{tab}.{task}")
	                if task != "All"
	                # NOTE(review): this task is always excluded from the "All"
	                # view; the reason is not visible in this file.
	                else row.startswith(f"{tab}.leaderboard_arc_challenge")
	            )
	            # Hide std errors
	            or (hide_std_errors and row.endswith("_stderr,none"))
	            # Hide non-different rows
	            or (show_only_differences and not any_difference[row])
	        )

	    styler = df.style.format(escape="html", na_rep="")
	    # Hide rows
	    styler.hide([row for row in df.index if is_hidden(row)], axis="index")
	    # Color metric result cells. Apply only on numeric cells, otherwise the
	    # background gradient will not work.
	    metric_suffixes = ("acc,none", "acc_norm,none", "exact_match,none")
	    colored_rows = [row for row in df.index if row.endswith(metric_suffixes)]
	    subset = pd.IndexSlice[colored_rows, :]
	    styler.background_gradient(cmap="PiYG", vmin=0, vmax=1, subset=subset, axis=None)
	    # Format index values: remove prefix and suffix. Only the length of the
	    # prefix matters; the trailing character after {task} presumably stands
	    # in for the one-character separator that follows the task name.
	    start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
	    styler.format_index(lambda label: label[start:].removesuffix(",none"), axis="index")
	    return styler.to_html()


	def update_tasks_component():
	    """Return a visible "Tasks" radio selector in both slots of a 2-tuple.

	    The choices are "All" followed by the values of ``constants.TASKS``, with
	    "All" preselected. The same ``gr.Radio`` instance fills both entries.
	    """
	    task_choices = ["All"] + list(constants.TASKS.values())
	    tasks_radio = gr.Radio(
	        task_choices,
	        label="Tasks",
	        info="Evaluation tasks to be displayed",
	        value="All",
	        visible=True,
	    )
	    return tasks_radio, tasks_radio


	def clear_results():
	    """Return the reset values for the eight result-related components.

	    Output order: model_id_1, model_id_2, dataframe_1, dataframe_2,
	    load_results_btn, load_configs_btn, results_task, configs_task.

	    Returns:
	        An 8-tuple: four ``None`` values, a disabled "Load" button twice, and
	        a hidden "Tasks" radio selector twice.
	    """
	    return (
	        None,
	        None,
	        None,
	        None,
	        # Unpack the repeated 1-tuples so each component gets its own slot.
	        *(gr.Button("Load", interactive=False),) * 2,
	        *(
	            gr.Radio(
	                ["All"] + list(constants.TASKS.values()),
	                label="Tasks",
	                info="Evaluation tasks to be displayed",
	                value="All",
	                visible=False,
	            ),
	        )
	        * 2,
	    )


	def display_loading_message_for_results():
	    """Return the centered "Loading..." HTML heading in both slots of a 2-tuple."""
	    loading_html = "<h3 style='text-align: center;'>Loading...</h3>"
	    return loading_html, loading_html