Spaces:

librarian-bots
/

ranker

Runtime error

App Files Files Community

ranker / app.py

davanstrien HF staff

fix url

f756530 about 1 year ago

raw

history blame contribute delete

17.6 kB

	import gradio as gr
	from huggingface_hub import list_spaces, list_models, list_datasets
	from cachetools import TTLCache, cached
	from toolz import groupby, valmap

	import platform
	from enum import Enum

	# True when the app runs on macOS (platform.system() reports "Darwin").
	is_macos = platform.system() == "Darwin"
	# Passed as `limit=` to the Hub listing calls: 1,000,000 on macOS, no limit (None) elsewhere.
	LIMIT = 1_000_000 if is_macos else None
	# NOTE(review): not referenced by any code visible in this file — presumably a display name
	# for repos whose author is None; confirm before relying on it.
	NONE_AUTHOR = "HuggingFace Team" # TODO deal with this


	class HubRepoType(Enum):
	    """The repository kinds this app knows how to rank.

	    Each member's value is the lowercase string used for that kind
	    elsewhere in this file (e.g. the repo-type radio choices).
	    """

	    # Member order is preserved because it defines iteration order.
	    MODEL = "model"
	    DATASET = "dataset"
	    SPACE = "space"


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def get_spaces():
	    """Return the Spaces yielded by ``list_spaces(full=True, limit=LIMIT)`` as a list.

	    The decorator keeps the result in a TTL cache for 30 minutes (60 * 30 s).
	    """
	    space_listing = list_spaces(full=True, limit=LIMIT)
	    return list(space_listing)


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def get_models():
	    """Return the models yielded by ``list_models(full=True, limit=LIMIT)`` as a list.

	    The decorator keeps the result in a TTL cache for 30 minutes (60 * 30 s).
	    """
	    model_listing = list_models(full=True, limit=LIMIT)
	    # list() consumes any iterable directly, so no explicit iter() is needed.
	    return list(model_listing)


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def get_datasets():
	    """Return the datasets yielded by ``list_datasets(full=True, limit=LIMIT)`` as a list.

	    The decorator keeps the result in a TTL cache for 30 minutes (60 * 30 s).
	    """
	    dataset_listing = list_datasets(full=True, limit=LIMIT)
	    # list() consumes any iterable directly, so no explicit iter() is needed.
	    return list(dataset_listing)


	# Call each fetcher once at import time so their TTL caches are already populated
	# when the first request arrives. The return values are intentionally discarded.
	get_spaces() # to warm up the cache
	get_models() # to warm up the cache
	get_datasets() # to warm up the cache


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def valid_dataset_ids():
	    """Return the set of ``id`` values of every dataset from ``get_datasets()``."""
	    known_ids = set()
	    for dataset in get_datasets():
	        known_ids.add(dataset.id)
	    return known_ids


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def valid_model_ids():
	    """Return the set of ``id`` values of every model from ``get_models()``."""
	    known_ids = set()
	    for model in get_models():
	        known_ids.add(model.id)
	    return known_ids


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def valid_space_ids():
	    """Return the set of ``id`` values of every Space from ``get_spaces()``."""
	    known_ids = set()
	    for space in get_spaces():
	        known_ids.add(space.id)
	    return known_ids


	# Snapshots of the known repo ids, taken once at import time.
	# NOTE(review): these module-level sets are never reassigned in the visible code, so they
	# do not follow the 30-minute TTL refresh of the functions they were built from.
	VALID_DATASET_IDS = valid_dataset_ids()
	VALID_MODEL_IDS = valid_model_ids()
	VALID_SPACE_IDS = valid_space_ids()


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def create_space_to_like_dict():
	    """Map each Space id to that Space's like count."""
	    return dict((space.id, space.likes) for space in get_spaces())


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def create_org_to_space_like_dict():
	    """Map each Space author to the total likes across that author's Spaces."""

	    def total_likes(author_spaces):
	        return sum(space.likes for space in author_spaces)

	    spaces_by_author = groupby(lambda space: space.author, get_spaces())
	    return valmap(total_likes, spaces_by_author)


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def create_model_to_like_dict(metric_kind):
	    """Map each model id to the requested metric of that model.

	    Args:
	        metric_kind: "likes" or "downloads".

	    Raises:
	        ValueError: if ``metric_kind`` is neither of the supported values.
	    """
	    # Fetch first, validate second: same call order as before.
	    all_models = get_models()
	    if metric_kind not in ("likes", "downloads"):
	        raise ValueError(f"Unsupported metric_kind: {metric_kind}")
	    return {model.id: getattr(model, metric_kind) for model in all_models}


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def create_org_to_model_metrics(metric_kind="likes"):
	    """Map each model author to the sum of one metric over that author's models.

	    Models whose ``author`` is None are left out of the result.

	    Args:
	        metric_kind: "likes" or "downloads"; selects which attribute is summed.

	    Returns:
	        dict of author -> summed metric.

	    Raises:
	        ValueError: if ``metric_kind`` is neither "likes" nor "downloads".
	    """
	    # The previous check was `if metric_kind:`, which is truthy for every
	    # non-empty string, so asking for "downloads" silently summed likes.
	    if metric_kind not in ("likes", "downloads"):
	        raise ValueError(f"Unsupported metric_kind: {metric_kind}")
	    authored_models = [model for model in get_models() if model.author is not None]
	    models_by_author = groupby(lambda model: model.author, authored_models)

	    def total(author_models):
	        # A missing (None) count is treated as 0 so sum() cannot fail on it.
	        return sum(getattr(model, metric_kind) or 0 for model in author_models)

	    return valmap(total, models_by_author)


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def create_dataset_to_like_dict(metric_kind="likes"):
	    """Map each dataset id to the requested metric of that dataset.

	    Args:
	        metric_kind: "likes" or "downloads".

	    Returns:
	        dict of dataset id -> metric value.

	    Raises:
	        ValueError: if ``metric_kind`` is neither of the supported values.
	            Previously the function fell through and returned None, which
	            only failed later, inside the caller, with an unrelated error.
	    """
	    if metric_kind not in ("likes", "downloads"):
	        raise ValueError(f"Unsupported metric_kind: {metric_kind}")
	    return {dataset.id: getattr(dataset, metric_kind) for dataset in get_datasets()}


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def create_org_to_dataset_metrics(metric_kind="likes"):
	    """Map each dataset author to the sum of one metric over that author's datasets.

	    Datasets whose ``author`` is None are left out of the result.

	    Args:
	        metric_kind: "likes" or "downloads"; selects which attribute is summed.

	    Returns:
	        dict of author -> summed metric.

	    Raises:
	        ValueError: if ``metric_kind`` is neither "likes" nor "downloads".
	    """
	    # The previous check was `if metric_kind:`, which is truthy for every
	    # non-empty string, so asking for "downloads" silently summed likes.
	    if metric_kind not in ("likes", "downloads"):
	        raise ValueError(f"Unsupported metric_kind: {metric_kind}")
	    authored = [dataset for dataset in get_datasets() if dataset.author is not None]
	    datasets_by_author = groupby(lambda dataset: dataset.author, authored)

	    def total(author_datasets):
	        # A missing (None) count is treated as 0 so sum() cannot fail on it.
	        return sum(getattr(dataset, metric_kind) or 0 for dataset in author_datasets)

	    return valmap(total, datasets_by_author)


	def relative_rank(my_dict, target_key, filter_zero=False):
	    """Locate ``target_key`` in a descending ranking of ``my_dict`` by value.

	    Args:
	        my_dict: mapping of key -> numeric score.
	        target_key: the key whose standing is wanted.
	        filter_zero: when true, entries whose score equals 0 are dropped
	            before ranking.

	    Returns:
	        dict with "rank" (1-based position as a percentage of the ranked
	        entries), "num_higher", "num_lower", "value" and "position".

	    Raises:
	        gr.Error: if ``target_key`` is absent (after optional filtering).
	    """
	    scores = my_dict
	    if filter_zero:
	        scores = {key: score for key, score in my_dict.items() if score != 0}

	    if target_key not in scores:
	        raise gr.Error(f"'{target_key}' not found please check the ID and try again.")

	    # Stable descending sort: equal scores keep their insertion order.
	    ranked_keys = sorted(scores, key=scores.get, reverse=True)
	    total = len(ranked_keys)
	    above = ranked_keys.index(target_key)
	    return {
	        "rank": (above + 1) / total * 100,
	        "num_higher": above,
	        "num_lower": total - above - 1,
	        "value": scores[target_key],
	        "position": above + 1,
	    }


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def relative_rank_for_space(space_id, filter_zero=False):
	    """Rank one Space among all Spaces by likes (result shape: see ``relative_rank``)."""
	    return relative_rank(
	        create_space_to_like_dict(), space_id, filter_zero=filter_zero
	    )


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def relative_rank_for_model(model_id, metric_kind="likes", filter_zero=False):
	    """Rank one model among all models by ``metric_kind`` (see ``relative_rank``)."""
	    return relative_rank(
	        create_model_to_like_dict(metric_kind), model_id, filter_zero=filter_zero
	    )


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def relative_rank_for_dataset(dataset_id, metric_kind="likes", filter_zero=False):
	    """Rank one dataset among all datasets by ``metric_kind`` (see ``relative_rank``)."""
	    return relative_rank(
	        create_dataset_to_like_dict(metric_kind), dataset_id, filter_zero=filter_zero
	    )


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def relative_space_rank_for_org(org_id, filter_zero=False):
	    """Rank an org/user by the total likes of its Spaces (see ``relative_rank``)."""
	    return relative_rank(
	        create_org_to_space_like_dict(), org_id, filter_zero=filter_zero
	    )


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def relative_model_rank_for_org(org_id, metric_kind="likes", filter_zero=False):
	    """Rank an org/user by its per-author model totals (see ``relative_rank``)."""
	    return relative_rank(
	        create_org_to_model_metrics(metric_kind), org_id, filter_zero=filter_zero
	    )


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def relative_dataset_rank_for_org(org_id, metric_kind="likes", filter_zero=False):
	    """Rank an org/user by its per-author dataset totals (see ``relative_rank``)."""
	    return relative_rank(
	        create_org_to_dataset_metrics(metric_kind), org_id, filter_zero=filter_zero
	    )


	# @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	# def rank_space(space_id):
	# return relative_rank_for_space(space_id)


	def rank_space_and_org(space_or_org_id, kind, filter_zero):
	    """Build the Markdown ranking report for a repo id or an org/user name.

	    Args:
	        space_or_org_id: either ``owner/name`` (a single repo) or a bare name.
	            A bare name is treated as a repo when ``kind`` is "model" or
	            "dataset" and the name is a known id of that kind; otherwise
	            it is treated as an org/user.
	        kind: "space", "model" or "dataset".
	        filter_zero: the string "yes" to drop zero-score entries from the ranking.

	    Returns:
	        Markdown text produced by ``_rank_single_repo`` or ``_rank_by_org``.

	    Raises:
	        ValueError: if the id contains more than one "/".
	    """
	    filter_zero = filter_zero == "yes"
	    # Ids never contain surrounding whitespace; a stray space from the
	    # textbox would otherwise make a valid id look unknown.
	    space_or_org_id = space_or_org_id.strip()
	    part_count = len(space_or_org_id.split("/"))

	    if part_count == 2:
	        return _rank_single_repo(space_or_org_id, kind, filter_zero)

	    if part_count == 1:
	        # Use the TTL-cached lookups instead of the import-time snapshots so
	        # repos created after start-up are recognised once the cache refreshes.
	        # Only the lookup for the requested kind is invoked.
	        id_lookups = {"model": valid_model_ids, "dataset": valid_dataset_ids}
	        lookup = id_lookups.get(kind)
	        if lookup is not None and space_or_org_id in lookup():
	            return _rank_single_repo(space_or_org_id, kind, filter_zero)
	        return _rank_by_org(space_or_org_id, kind, filter_zero)

	    raise ValueError(
	        f"Unexpected combination of space_or_org_id '{space_or_org_id}' and kind"
	        f" '{kind}'"
	    )


	def _rank_by_org(space_or_org_id, kind, filter_zero):
	    """Render the Markdown report ranking an org/user across all of its repos.

	    Args:
	        space_or_org_id: the org or user name.
	        kind: "space", "model" or "dataset".
	        filter_zero: bool, forwarded to the ranking helper.

	    Returns:
	        Markdown text, ending with the footer from ``_create_footer_message``.

	    Raises:
	        ValueError: if ``kind`` is not one of the three supported values
	            (previously this surfaced as an unbound-local error).
	    """
	    rankers = {
	        "space": relative_space_rank_for_org,
	        "model": relative_model_rank_for_org,
	        "dataset": relative_dataset_rank_for_org,
	    }
	    if kind not in rankers:
	        raise ValueError(f"Unsupported kind: {kind}")
	    org_rank = rankers[kind](space_or_org_id, filter_zero=filter_zero)

	    # The final line covers the three per-kind variants: "Spaces"/"Models"/
	    # "Datasets" and the matching ?sort_<kind>s=likes#<kind>s anchor.
	    sections = [
	        f"## ⭐️ Org/User {kind.title()} Likes Rankings ⭐️\n",
	        f"Here are the rankings for the org/user across all of their {kind}s \n",
	        f"- You have {org_rank['value']:,} likes for this org/user.\n",
	        f"- Your org/user is ranked {org_rank['position']:,}\n",
	        f"- You have {org_rank['num_higher']:,} orgs/users above and {org_rank['num_lower']:,} orgs/users below in the ranking of {kind} likes \n\n",
	        f"- Organization or user [{space_or_org_id}](https://huggingface.co./{space_or_org_id}) is ranked in the top {org_rank['rank']:.2f}% \n\n",
	        f"You can find all your {kind.title()}s sorted by likes [here](https://huggingface.co./{space_or_org_id}?sort_{kind}s=likes#{kind}s)\n",
	    ]
	    return _create_footer_message("".join(sections), kind)


	def _rank_single_repo(space_or_org_id, kind, filter_zero):
	    """Render the Markdown report ranking one repository among all of its kind.

	    Args:
	        space_or_org_id: the repository id.
	        kind: "space", "model" or "dataset".
	        filter_zero: bool, forwarded to the ranking helper.

	    Returns:
	        Markdown text, ending with the footer from ``_create_footer_message``.

	    Raises:
	        ValueError: if ``kind`` is not one of the three supported values
	            (previously this surfaced as an unbound-local error).
	    """
	    rankers = {
	        "space": relative_rank_for_space,
	        "model": relative_rank_for_model,
	        "dataset": relative_rank_for_dataset,
	    }
	    # Path segment in front of the repo id on the Hub: Spaces and datasets
	    # are namespaced, models live at the root.
	    url_prefixes = {"space": "spaces/", "model": "", "dataset": "datasets/"}
	    if kind not in rankers:
	        raise ValueError(f"Unsupported kind: {kind}")
	    repo_rank = rankers[kind](space_or_org_id, filter_zero=filter_zero)

	    # Fixes two link bugs: the intro link used to point at /spaces/ for every
	    # kind, and the dataset link used /dataset/ instead of /datasets/.
	    repo_url = f"https://huggingface.co./{url_prefixes[kind]}{space_or_org_id}"
	    sections = [
	        f"## ⭐️ {kind.title()} Likes Rankings ⭐️\n",
	        f"Here are the rankings by likes for [`{space_or_org_id}`]({repo_url}) across all {kind}s \n",
	        f"- You have {repo_rank['value']:,} likes for this {kind}.\n",
	        f"- Your {kind} is ranked {repo_rank['position']:,}.\n",
	        f"- {kind.title()} [{space_or_org_id}]({repo_url}) is ranked {repo_rank['rank']:.2f}%\n",
	        f"- You have {repo_rank['num_higher']:,} {kind}s above and {repo_rank['num_lower']:,} {kind}s below in the ranking of {kind}s likes\n\n",
	    ]
	    return _create_footer_message("".join(sections), kind)


	def _create_footer_message(result, kind):
	result += """### ✨ Remember likes aren't everything!✨\n"""
	if kind == "space":
	result += """Some Spaces go very viral whilst other Spaces may be very useful for a smaller audience. If you think your Space is useful, please add it to this [thread](https://huggingface.co./spaces/librarian-bots/ranker/discussions/3) of awesome Spaces.
	We'll look out for awesome Spaces added to this thread to promote more widely!"""
	return result


	def get_top_n_orgs_and_users_spaces(top_n=100):
	    """Return the ``top_n`` (author, total Space likes) pairs, highest first."""
	    likes_by_author = create_org_to_space_like_dict()
	    ranked = sorted(likes_by_author.items(), key=lambda pair: pair[1], reverse=True)
	    return ranked[:top_n]


	def get_top_n_orgs_and_users_models(metric, top_n=100):
	    """Return the ``top_n`` (author, model total) pairs for ``metric``, highest first."""
	    totals_by_author = create_org_to_model_metrics(metric)
	    ranked = sorted(totals_by_author.items(), key=lambda pair: pair[1], reverse=True)
	    return ranked[:top_n]


	def get_top_n_orgs_and_users_datasets(metric, top_n=100):
	    """Return the ``top_n`` (author, dataset total) pairs for ``metric``, highest first."""
	    totals_by_author = create_org_to_dataset_metrics(metric)
	    ranked = sorted(totals_by_author.items(), key=lambda pair: pair[1], reverse=True)
	    return ranked[:top_n]


	def plot_top_n_orgs_and_users(kind, metric="likes", top_n=100):
	    """Render the org/user leaderboard for one repo kind as a Markdown list.

	    Args:
	        kind: "space", "model" or "dataset".
	        metric: metric forwarded to the model/dataset helpers. Spaces are
	            always ranked by likes, so it is ignored for them.
	        top_n: number of leaderboard entries requested.

	    Returns:
	        Markdown text (header plus one numbered line per org/user), or None
	        when ``kind`` is not one of the supported values.
	    """
	    if kind == "space":
	        leaders = get_top_n_orgs_and_users_spaces(top_n)
	        metric = "likes"
	    elif kind == "model":
	        leaders = get_top_n_orgs_and_users_models(metric, top_n=top_n)
	    elif kind == "dataset":
	        leaders = get_top_n_orgs_and_users_datasets(metric, top_n=top_n)
	    else:
	        # Unsupported kinds produced no output before; keep that contract.
	        return None

	    # The header and unit used to be hard-coded as "Top 100" / "likes"; they
	    # now follow the arguments. With the defaults the text is unchanged.
	    header = f"## 🏅 Top {top_n} Orgs and Users by {kind.title()} {metric.title()} 🏅"
	    body = "".join(
	        f"\n{place}. [{org}](https://huggingface.co./{org}) with {count:,} {metric}"
	        for place, (org, count) in enumerate(leaders, start=1)
	    )
	    return header + body


	def get_top_n_spaces(top_n=100):
	    """Return the ``top_n`` (Space id, likes) pairs, highest first."""
	    likes_by_space = create_space_to_like_dict()
	    ranked = sorted(likes_by_space.items(), key=lambda pair: pair[1], reverse=True)
	    return ranked[:top_n]


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def get_top_n_models(metric_kind, top_n=100):
	    """Return the ``top_n`` (model id, metric value) pairs, highest first."""
	    metric_by_model = create_model_to_like_dict(metric_kind)
	    ranked = sorted(metric_by_model.items(), key=lambda pair: pair[1], reverse=True)
	    return ranked[:top_n]


	@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
	def get_top_n_datasets(metric, top_n=100):
	    """Return the ``top_n`` (dataset id, metric value) pairs, highest first."""
	    metric_by_dataset = create_dataset_to_like_dict(metric)
	    ranked = sorted(metric_by_dataset.items(), key=lambda pair: pair[1], reverse=True)
	    return ranked[:top_n]


	def _plot_top_n_hub_repos(kind: HubRepoType, metric="likes", top_n=100):
	    """Render the repository leaderboard for one repo kind as a Markdown list.

	    Args:
	        kind: which repository type to rank.
	        metric: metric forwarded to the model/dataset helpers. Spaces are
	            always ranked by likes, so it is ignored for them.
	        top_n: number of leaderboard entries requested.

	    Returns:
	        Markdown text (header plus one numbered, linked line per repo), or
	        None when ``kind`` is not a supported ``HubRepoType`` member.
	    """
	    if kind == HubRepoType.SPACE:
	        leaders = get_top_n_spaces(top_n)
	        url_prefix = "spaces/"
	        metric = "likes"
	    elif kind == HubRepoType.MODEL:
	        leaders = get_top_n_models(metric, top_n)
	        url_prefix = ""
	    elif kind == HubRepoType.DATASET:
	        leaders = get_top_n_datasets(metric, top_n)
	        # Dataset pages live under /datasets/ (plural); the old /dataset/
	        # links did not resolve.
	        url_prefix = "datasets/"
	    else:
	        # Unsupported kinds produced no output before; keep that contract.
	        return None

	    # The header and unit used to be hard-coded as "Top 100" / "Likes"; they
	    # now follow the arguments. With the defaults the text is unchanged.
	    header = (
	        f"## 🏅 Top {top_n} {kind.value.title()} repositories by"
	        f" {metric.title()} 🏅"
	    )
	    body = "".join(
	        f"\n{place}. [{repo}](https://huggingface.co./{url_prefix}{repo}) with"
	        f" {count:,} {metric}"
	        for place, (repo, count) in enumerate(leaders, start=1)
	    )
	    return header + body


	def plot_top_n_hub_repos(kind, metric_kind="likes", top_n=100):
	    """Render the repository leaderboard for a repo kind given as a string.

	    Args:
	        kind: "space", "model" or "dataset".
	        metric_kind: metric forwarded to ``_plot_top_n_hub_repos``.
	        top_n: number of leaderboard entries requested.

	    Returns:
	        The Markdown from ``_plot_top_n_hub_repos``, or None when ``kind``
	        is not one of the supported strings.
	    """
	    try:
	        repo_type = HubRepoType(kind)
	    except ValueError:
	        # Unsupported kinds produced no output before; keep that contract.
	        return None
	    # Always pass by keyword: the space branch used to pass ``top_n``
	    # positionally, which landed in the ``metric`` slot, so ``top_n`` was ignored.
	    return _plot_top_n_hub_repos(repo_type, metric=metric_kind, top_n=top_n)


	# Build the Gradio UI: a ranking lookup form followed by the Top 100 leaderboards.
	# NOTE(review): the nesting of the `with` blocks below was lost in this copy of the file;
	# restore the indentation from the original app.py before running.
	with gr.Blocks() as demo:
	gr.HTML("<h1 style='text-align: center;'> 🏆 HuggyRanker 🏆 </h1>")
	gr.HTML(
	"""<p style='text-align: center;'>Rank a single repository or all of the repositories created by an organization or user by likes</p>"""
	)
	gr.HTML(
	"""<p style="text-align: center;"><i>Remember likes aren't everything!</i></p>"""
	)
	gr.Markdown(
	"""## Rank Specific Hub repositories or rank an organization or user by likes
	Provide this app with a Hub ID e.g. `librarian-bots/ranker` or a Username/Organization name e.g. `librarian-bots` to rank by likes."""
	)
	# Inputs for the lookup: the id textbox, the zero-likes filter and the repo type.
	with gr.Row():
	space_id = gr.Textbox(
	"librarian-bots", max_lines=1, label="Space or user/organization ID"
	)
	# rank_space_and_org compares this value against the string "yes".
	filter_zero_likes = gr.Radio(
	choices=["no", "yes"],
	label="Filter out repositories with 0 likes in the ranking?",
	value="yes",
	)
	# Shared by the lookup form and both leaderboard callbacks below.
	repo_type = gr.Radio(
	choices=["space", "model", "dataset"],
	label="Type of repo",
	value="space",
	interactive=True,
	)
	run_btn = gr.Button("Show ranking for this Space or org/user!", label="Rank Space")
	result = gr.Markdown()
	# Clicking the button renders rank_space_and_org's Markdown into `result`.
	run_btn.click(
	rank_space_and_org,
	inputs=[space_id, repo_type, filter_zero_likes],
	outputs=result,
	)
	gr.Markdown("## Leaderboard of Top 100 Spaces and Orgs/Users by Likes")
	gr.Markdown(
	"""The leaderboard is updated every 30 minutes.
	Choose the type of repo to rank by likes and click the button to show the leaderboard."""
	)
	show_refresh_btn = gr.Button("Show/refresh Leaderboard", label="Refresh")
	with gr.Row():
	with gr.Accordion("Show rankings for Orgs and Users", open=False):
	org_user_ranking = gr.Markdown()
	# Only repo_type is passed, so the callback runs with its default metric and top_n.
	show_refresh_btn.click(
	plot_top_n_orgs_and_users, inputs=[repo_type], outputs=org_user_ranking
	)
	with gr.Accordion("Show rankings for individual repositories", open=False):
	repo_level_ranking = gr.Markdown()
	# Only repo_type is passed, so the callback runs with its default metric_kind and top_n.
	show_refresh_btn.click(
	plot_top_n_hub_repos, inputs=[repo_type], outputs=repo_level_ranking
	)
	# NOTE(review): `concurrency_count` may not be accepted by `queue()` in newer Gradio
	# major versions — confirm against the Gradio version this Space pins.
	demo.queue(concurrency_count=4).launch()