multimodal-clem-leaderboard

Running

App Files Files Community

multimodal-clem-leaderboard / src /utils.py

sherzod-hakimov

adapt short model names

250513d 10 months ago

raw

history blame

No virus

8.41 kB

	import os
	import pandas as pd
	import matplotlib.pyplot as plt
	import numpy as np

	from src.assets.text_content import SHORT_NAMES

	def update_cols(df: pd.DataFrame) -> pd.DataFrame:
	    '''
	    Collapse the multi-row header of a raw leaderboard dataframe into one header row
	    Remove this function if the dataframe has only one header row

	    The first four columns are renamed to 'Model', 'Clemscore', 'Played' and
	    'Quality Score'. The remaining columns are read in groups of three: the name of
	    the first column in each group is taken as the game name and the group becomes
	    '<Game>(Played)', '<Game>(Quality Score)', '<Game>(Quality Score[std])'.

	    Args:
	        df: Raw dataframe containing 3 separate header rows
	    Returns:
	        df: Renamed dataframe without its first two rows (presumably the two
	            leftover header rows that were read in as data)
	    '''
	    original_names = list(df.columns)

	    # Fixed names for the four leading (overall) columns
	    renamed = ['Model', 'Clemscore', 'Played', 'Quality Score']

	    # Every third remaining column starts a new game; its name labels the whole triple
	    for game in original_names[4:][::3]:
	        label = str(game).capitalize()
	        renamed.extend([
	            label + "(Played)",
	            label + "(Quality Score)",
	            label + "(Quality Score[std])",
	        ])

	    # 'renamed' is never shorter than 'original_names', so zip pairs every old name
	    rename_map = dict(zip(original_names, renamed))

	    return df.rename(columns=rename_map).iloc[2:]

	def _dedupe_model_name(model_name: str) -> str:
	    '''Strip '-t0.0' from a 'modelA--modelB' name and collapse identical halves.'''
	    parts = [part.replace('-t0.0', '') for part in model_name.split('--')]  # Comment to not remove -t0.0
	    # A name without '--' has a single part; return it instead of indexing parts[1]
	    if len(parts) == 1 or parts[0] == parts[1]:
	        return parts[0]
	    # Only the first two parts are kept
	    return parts[0] + "--" + parts[1]


	def process_df(df: pd.DataFrame) -> pd.DataFrame:
	    '''
	    Process dataframe - Remove repetition in model names, convert datatypes to sort by "float" instead of "str"

	    The first column is treated as the model-name column; every other column is
	    cast to float. The dataframe is modified in place and also returned.

	    Args:
	        df: Unprocessed Dataframe (after using update_cols)
	    Returns:
	        df: Processed Dataframe (the same object that was passed in)
	    '''
	    column_names = list(df.columns)
	    model_col_name = column_names[0]

	    # Change column type to float from str
	    for col in column_names:
	        if col != model_col_name:
	            df[col] = df[col].astype(float)

	    # Remove repetition in model names, if any. Iterating the column directly
	    # avoids building a full row Series per model (df.iloc[i][...]).
	    df[model_col_name] = [_dedupe_model_name(name) for name in df[model_col_name]]

	    return df

	def _version_sort_key(file_name: str) -> list:
	    '''Natural-order sort key: digit runs compare as integers, so v1.10 ranks above v1.9.'''
	    import re

	    stem = os.path.splitext(file_name)[0]
	    # re.split with a capture group alternates text / digits, so keys of two
	    # names always compare str with str and int with int position by position
	    return [int(token) if token.isdigit() else token for token in re.split(r'(\d+)', stem)]


	def get_data(path: str, flag: bool):
	    '''
	    Get a list of all version names and respective Dataframes
	    Args:
	        path: Path to the directory containing CSVs of different versions -> v0.9.csv, v1.0.csv, ....
	        flag: Set this flag to include the latest version in Details and Versions tab
	    Returns:
	        None if the directory is empty or holds no CSV file (a message is printed),
	        otherwise a tuple of:
	        latest_df: singular list containing dataframe of the latest version of the leaderboard with only 4 columns
	        latest_vname: list of the name of latest version
	        previous_df: list of dataframes for previous versions (can skip latest version if required)
	        previous_vname: list of the names for the previous versions (INCLUDED IN Details and Versions Tab)
	    '''
	    # Check if Directory is empty
	    list_versions = os.listdir(path)
	    if not list_versions:
	        print("Directory is empty")
	        return None

	    # Newest version first. Plain string sorting would rank 'v1.9' above 'v1.10'.
	    files = sorted(
	        (file for file in list_versions if file.endswith('.csv')),
	        key=_version_sort_key,
	        reverse=True,
	    )
	    if not files:
	        # Directory has entries but none is a CSV; indexing the first frame would fail
	        print("No CSV files found in directory")
	        return None

	    file_names = [os.path.splitext(file)[0] for file in files]

	    dfs = []
	    for file in files:
	        df = pd.read_csv(os.path.join(path, file))
	        df = update_cols(df)  # Remove if by default there is only one header row
	        df = process_df(df)  # Process Dataframe
	        df = df.sort_values(by=list(df.columns)[1], ascending=False)  # Sort by clemscore
	        dfs.append(df)

	    # Only keep relevant columns (the first four) for the main leaderboard
	    newest = dfs[0]
	    all_columns = list(newest.columns)
	    keep_columns = all_columns[0:4]
	    newest = newest.drop(columns=[c for c in all_columns if c not in keep_columns])

	    latest_df = [newest]
	    latest_vname = [file_names[0]]

	    # The latest version is listed among the "previous" ones only when flag is set
	    start = 0 if flag else 1
	    previous_df = dfs[start:]
	    previous_vname = file_names[start:]

	    return latest_df, latest_vname, previous_df, previous_vname


	# Leading columns as named by update_cols: ['Model', 'Clemscore', 'Played', 'Quality Score']
	def compare_plots(df: pd.DataFrame, LIST: list):
	    '''
	    Quality Score v/s % Played plot by selecting models

	    Columns are read by position: column 0 holds the model name, column 2 is
	    plotted on the x-axis ('% Played') and column 3 on the y-axis ('Quality Score').
	    Models in LIST that have no row in df are skipped.

	    Args:
	        df: Leaderboard dataframe
	        LIST: The list of models to show in the plot, updated from frontend
	    Returns:
	        fig: The plot
	    '''
	    short_names = label_map(LIST)

	    list_columns = list(df.columns)
	    model_col = list_columns[0]
	    played_col = list_columns[2]
	    df = df[df[model_col].isin(LIST)]

	    # Loop-invariant colour normaliser. Guard the empty / all-zero cases so the
	    # division below can neither fail nor produce NaN colours.
	    max_played = df[played_col].max() if len(df) else 0
	    colour_scale = max_played if max_played > 0 else 1

	    fig, ax = plt.subplots()
	    for model in LIST:
	        model_df = df[df[model_col] == model]
	        if model_df.empty:
	            # Selected model is not present in this dataframe; nothing to draw
	            continue

	        short = short_names[model]
	        x = model_df[played_col]
	        y = model_df[list_columns[3]]
	        color = plt.cm.rainbow(x / colour_scale)  # Use a colormap for different colors
	        ax.scatter(x, y, color=color)

	        # Annotate each point with scalar coordinates instead of handing whole
	        # Series to matplotlib
	        for x_val, y_val in zip(x, y):
	            ax.annotate(f'{short}', (x_val, y_val), textcoords="offset points",
	                        xytext=(0, -15), ha='center', rotation=0)

	    ax.grid(which='both', color='grey', linewidth=1, linestyle='-', alpha=0.2)
	    ax.set_xticks(np.arange(0, 110, 10))
	    ax.set_xlim(-10, 110)
	    ax.set_ylim(-10, 110)
	    ax.set_xlabel('% Played')
	    ax.set_ylabel('Quality Score')
	    ax.set_title('Overview of benchmark results')
	    # NOTE(review): kept from the original; probably unnecessary since the figure
	    # is returned to the caller - confirm before removing
	    plt.show()

	    return fig

	def shorten_model_name(full_name: str) -> str:
	    '''
	    Build a compact plot label from a long model name
	    Args:
	        full_name: Model name, optionally made of '-' separated parts
	    Returns:
	        short_name: For a name without '-', its first three characters. Otherwise the
	            first character of the name followed by the parts that contain a digit
	            (model sizes and versions), joined with '-'. If no part contains a digit,
	            only the first character is returned.
	    '''
	    # Split the name into parts
	    parts = full_name.split('-')

	    if len(parts) == 1:
	        # Slicing already copes with names shorter than three characters
	        return full_name[:3]

	    # Keep only the parts with digits (model sizes and versions)
	    digit_parts = [part for part in parts if any(char.isdigit() for char in part)]
	    suffix = '-'.join(digit_parts).strip('-')

	    # Without a digit-bearing part the suffix is empty; do not leave a dangling '-'
	    if not suffix:
	        return full_name[0]

	    return full_name[0] + '-' + suffix

	def label_map(model_list: list) -> dict:
	    '''
	    Build the long-name -> short-name lookup used to label points in the frontend graph
	    Short names are defined in src/assets/text_content.py (SHORT_NAMES)
	    Args:
	        model_list: A list of long model names
	    Returns:
	        short_names: A dict mapping every long name to its short name (a string)
	    '''
	    # Prefer the curated SHORT_NAMES entry; otherwise derive a label from the name itself
	    return {
	        long_name: (
	            SHORT_NAMES[long_name]
	            if long_name in SHORT_NAMES
	            else shorten_model_name(long_name)
	        )
	        for long_name in model_list
	    }

	def filter_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
	    '''
	    Filter the dataframe based on the search query

	    Matching is a case-insensitive substring test against the first column.
	    Whitespace around each query is ignored and empty queries (e.g. from a
	    trailing ";") are dropped, so "gpt; llava;" behaves like "gpt;llava".

	    Args:
	        df: Unfiltered dataframe
	        query: a string of queries separated by ";"
	    Return:
	        filtered_df: Dataframe containing searched queries in the 'Model' column;
	            df itself is returned when the query holds no search term
	    '''
	    terms = [term.strip().lower() for term in query.split(';')]
	    terms = [term for term in terms if term]

	    # Nothing to search for: an empty term would otherwise match every row
	    if not terms:
	        return df

	    model_col = list(df.columns)[0]

	    # Explicit boolean Series aligned on df's index; also safe for an empty df
	    keep = pd.Series(
	        [any(term in str(name).lower() for term in terms) for name in df[model_col]],
	        index=df.index,
	        dtype=bool,
	    )

	    return df[keep]