ranker / app.py
davanstrien's picture
davanstrien HF staff
fix url
f756530
import gradio as gr
from huggingface_hub import list_spaces, list_models, list_datasets
from cachetools import TTLCache, cached
from toolz import groupby, valmap
import platform
from enum import Enum
# True when the interpreter reports it is running on macOS.
is_macos = platform.system() == "Darwin"

# Maximum number of repos requested per Hub listing call: capped on macOS,
# unlimited (None) everywhere else.
# NOTE(review): presumably macOS is the local development machine - confirm.
if is_macos:
    LIMIT = 1_000_000
else:
    LIMIT = None

# Placeholder author name; not referenced by the code visible in this file.
NONE_AUTHOR = "HuggingFace Team"  # TODO deal with this
class HubRepoType(Enum):
    """The three repository kinds this app can rank.

    Each member's value is the lowercase string used for that kind
    elsewhere in the app (e.g. by the "Type of repo" radio button).
    """

    MODEL = "model"
    DATASET = "dataset"
    SPACE = "space"
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_spaces():
    """Return every Space listed by the Hub as a list.

    The listing is requested with ``full=True`` and capped at ``LIMIT``.
    The result is memoised in a TTL cache (ttl = 60 * 30).
    """
    listing = list_spaces(full=True, limit=LIMIT)
    return list(listing)
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_models():
    """Return every model listed by the Hub as a list.

    The listing is requested with ``full=True`` and capped at ``LIMIT``.
    The result is memoised in a TTL cache (ttl = 60 * 30).
    """
    listing = list_models(full=True, limit=LIMIT)
    return list(listing)
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_datasets():
    """Return every dataset listed by the Hub as a list.

    The listing is requested with ``full=True`` and capped at ``LIMIT``.
    The result is memoised in a TTL cache (ttl = 60 * 30).
    """
    listing = list_datasets(full=True, limit=LIMIT)
    return list(listing)
# Call each listing function once at import time; the results are stored by
# the TTL caches those functions are decorated with.
get_spaces() # to warm up the cache
get_models() # to warm up the cache
get_datasets() # to warm up the cache
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def valid_dataset_ids():
    """Return the set of ``id`` values of all datasets from ``get_datasets()``."""
    ids = set()
    for entry in get_datasets():
        ids.add(entry.id)
    return ids
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def valid_model_ids():
    """Return the set of ``id`` values of all models from ``get_models()``."""
    ids = set()
    for entry in get_models():
        ids.add(entry.id)
    return ids
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def valid_space_ids():
    """Return the set of ``id`` values of all Spaces from ``get_spaces()``."""
    ids = set()
    for entry in get_spaces():
        ids.add(entry.id)
    return ids
# Snapshots of the known repo ids, taken once when the module is imported.
# These names are bound a single time here, so they do not change when the
# underlying TTL caches later expire.
VALID_DATASET_IDS, VALID_MODEL_IDS, VALID_SPACE_IDS = (
    valid_dataset_ids(),
    valid_model_ids(),
    valid_space_ids(),
)
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_space_to_like_dict():
    """Return a dict mapping each Space id to that Space's ``likes`` value."""
    likes_by_space = {}
    for entry in get_spaces():
        likes_by_space[entry.id] = entry.likes
    return likes_by_space
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_org_to_space_like_dict():
    """Return a dict mapping each Space author to the sum of their Spaces' likes.

    Spaces are grouped by their ``author`` attribute; no author value is
    filtered out here.
    """

    def total_likes(owned_spaces):
        return sum(entry.likes for entry in owned_spaces)

    spaces_by_author = groupby(lambda entry: entry.author, get_spaces())
    return valmap(total_likes, spaces_by_author)
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_model_to_like_dict(metric_kind):
    """Return a dict mapping each model id to the requested metric.

    Args:
        metric_kind: ``"likes"`` or ``"downloads"``.

    Raises:
        ValueError: if ``metric_kind`` is anything else.
    """
    all_models = get_models()
    if metric_kind == "likes":
        by_metric = {entry.id: entry.likes for entry in all_models}
    elif metric_kind == "downloads":
        by_metric = {entry.id: entry.downloads for entry in all_models}
    else:
        raise ValueError(f"Unsupported metric_kind: {metric_kind}")
    return by_metric
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_org_to_model_metrics(metric_kind="likes"):
    """Return a dict mapping each model author to the summed metric of their models.

    Models whose ``author`` is ``None`` are excluded before grouping.

    Args:
        metric_kind: ``"likes"`` or ``"downloads"``; selects which attribute
            of each model is summed per author.

    Raises:
        ValueError: if ``metric_kind`` is neither ``"likes"`` nor ``"downloads"``.
    """
    # Compare against the metric name explicitly: a plain truthiness test is
    # true for "downloads" as well, which made every call sum likes.
    if metric_kind not in ("likes", "downloads"):
        raise ValueError(f"Unsupported metric_kind: {metric_kind}")
    # remove authors who are None
    models = [model for model in get_models() if model.author is not None]
    grouped = groupby(lambda model: model.author, models)
    return valmap(
        lambda owned: sum(getattr(model, metric_kind) for model in owned), grouped
    )
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_dataset_to_like_dict(metric_kind="likes"):
    """Return a dict mapping each dataset id to the requested metric.

    Args:
        metric_kind: ``"likes"`` or ``"downloads"``.

    Raises:
        ValueError: if ``metric_kind`` is anything else. Without this check
            the function would return ``None`` and callers would fail later
            when they try to iterate the result.
    """
    if metric_kind not in ("likes", "downloads"):
        raise ValueError(f"Unsupported metric_kind: {metric_kind}")
    datasets = get_datasets()
    return {dataset.id: getattr(dataset, metric_kind) for dataset in datasets}
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_org_to_dataset_metrics(metric_kind="likes"):
    """Return a dict mapping each dataset author to the summed metric of their datasets.

    Datasets whose ``author`` is ``None`` are excluded before grouping.

    Args:
        metric_kind: ``"likes"`` or ``"downloads"``; selects which attribute
            of each dataset is summed per author.

    Raises:
        ValueError: if ``metric_kind`` is neither ``"likes"`` nor ``"downloads"``.
    """
    # Compare against the metric name explicitly: a plain truthiness test is
    # true for "downloads" as well, which made every call sum likes.
    if metric_kind not in ("likes", "downloads"):
        raise ValueError(f"Unsupported metric_kind: {metric_kind}")
    # remove authors who are None
    datasets = [dataset for dataset in get_datasets() if dataset.author is not None]
    grouped = groupby(lambda dataset: dataset.author, datasets)
    return valmap(
        lambda owned: sum(getattr(dataset, metric_kind) for dataset in owned),
        grouped,
    )
def relative_rank(my_dict, target_key, filter_zero=False):
    """Locate ``target_key`` in a descending ranking of ``my_dict`` by value.

    Args:
        my_dict: mapping of key -> numeric score.
        target_key: the key whose rank is wanted.
        filter_zero: when true, entries whose score equals 0 are dropped
            before ranking (so a zero-scored target is reported as not found).

    Returns:
        A dict with:
          - "rank": 1-based position as a percentage of the ranked entries,
          - "num_higher": number of entries sorted before the target,
          - "num_lower": number of entries sorted after the target,
          - "value": the target's score,
          - "position": 1-based position of the target.

    Raises:
        gr.Error: if ``target_key`` is not among the ranked entries.
    """
    scores = my_dict
    if filter_zero:
        scores = {key: score for key, score in my_dict.items() if score != 0}

    if target_key not in scores:
        raise gr.Error(f"'{target_key}' not found please check the ID and try again.")

    # Descending sort; entries with equal scores keep their original order.
    ordered_keys = [
        key
        for key, _ in sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    ]
    total = len(ordered_keys)
    above = ordered_keys.index(target_key)
    place = above + 1
    return {
        "rank": place / total * 100,
        "num_higher": above,
        "num_lower": total - place,
        "value": scores[target_key],
        "position": place,
    }
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_rank_for_space(space_id, filter_zero=False):
    """Rank one Space by likes among all Spaces; returns ``relative_rank``'s dict."""
    return relative_rank(
        create_space_to_like_dict(), space_id, filter_zero=filter_zero
    )
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_rank_for_model(model_id, metric_kind="likes", filter_zero=False):
    """Rank one model by ``metric_kind`` among all models; returns ``relative_rank``'s dict."""
    return relative_rank(
        create_model_to_like_dict(metric_kind), model_id, filter_zero=filter_zero
    )
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_rank_for_dataset(dataset_id, metric_kind="likes", filter_zero=False):
    """Rank one dataset by ``metric_kind`` among all datasets; returns ``relative_rank``'s dict."""
    return relative_rank(
        create_dataset_to_like_dict(metric_kind), dataset_id, filter_zero=filter_zero
    )
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_space_rank_for_org(org_id, filter_zero=False):
    """Rank an org/user by total Space likes; returns ``relative_rank``'s dict."""
    return relative_rank(
        create_org_to_space_like_dict(), org_id, filter_zero=filter_zero
    )
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_model_rank_for_org(org_id, metric_kind="likes", filter_zero=False):
    """Rank an org/user by their summed model metric; returns ``relative_rank``'s dict."""
    return relative_rank(
        create_org_to_model_metrics(metric_kind), org_id, filter_zero=filter_zero
    )
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_dataset_rank_for_org(org_id, metric_kind="likes", filter_zero=False):
    """Rank an org/user by their summed dataset metric; returns ``relative_rank``'s dict."""
    return relative_rank(
        create_org_to_dataset_metrics(metric_kind), org_id, filter_zero=filter_zero
    )
# @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
# def rank_space(space_id):
# return relative_rank_for_space(space_id)
def rank_space_and_org(space_or_org_id, kind, filter_zero):
    """Build the Markdown ranking report for a repo id or an org/user name.

    Args:
        space_or_org_id: either ``owner/name`` (a repo) or a bare name.
        kind: ``"space"``, ``"model"`` or ``"dataset"``.
        filter_zero: the string ``"yes"`` enables zero filtering; any other
            value disables it.

    Returns:
        Markdown text from ``_rank_single_repo`` or ``_rank_by_org``.

    Raises:
        ValueError: if the id contains more than one ``/``.
    """
    drop_zero = filter_zero == "yes"
    n_parts = len(space_or_org_id.split("/"))

    # "owner/name" is always treated as a single repository.
    if n_parts == 2:
        return _rank_single_repo(space_or_org_id, kind, drop_zero)

    if n_parts != 1:
        raise ValueError(
            f"Unexpected combination of space_or_org_id '{space_or_org_id}' and kind"
            f" '{kind}'"
        )

    # A bare name is a repo only if it is a known model/dataset id; otherwise
    # it is treated as an org/user. Spaces have no entry in this table.
    slashless_ids = {"model": VALID_MODEL_IDS, "dataset": VALID_DATASET_IDS}
    is_repo = kind in slashless_ids and space_or_org_id in slashless_ids[kind]
    handler = _rank_single_repo if is_repo else _rank_by_org
    return handler(space_or_org_id, kind, drop_zero)
def _rank_by_org(space_or_org_id, kind, filter_zero):
    """Build the Markdown report ranking an org/user by likes for one repo kind.

    Args:
        space_or_org_id: the org or user name.
        kind: ``"space"``, ``"model"`` or ``"dataset"``.
        filter_zero: whether zero-valued orgs/users are excluded from the ranking.

    Returns:
        Markdown text, ending with the footer from ``_create_footer_message``.

    Raises:
        ValueError: if ``kind`` is not one of the three supported kinds.
    """
    if kind == "space":
        org_rank = relative_space_rank_for_org(space_or_org_id, filter_zero=filter_zero)
    elif kind == "model":
        org_rank = relative_model_rank_for_org(space_or_org_id, filter_zero=filter_zero)
    elif kind == "dataset":
        org_rank = relative_dataset_rank_for_org(
            space_or_org_id, filter_zero=filter_zero
        )
    else:
        # Fail with a clear message instead of an unbound `org_rank` below.
        raise ValueError(f"Unsupported kind: {kind}")

    # Canonical host (no trailing dot after ".co"), shared by every link below.
    profile_url = f"https://huggingface.co/{space_or_org_id}"
    parts = [
        f"## ⭐️ Org/User {kind.title()} Likes Rankings ⭐️\n",
        f"Here are the rankings for the org/user across all of their {kind}s \n",
        f"- You have {org_rank['value']:,} likes for this org/user.\n",
        f"- Your org/user is ranked {org_rank['position']:,}\n",
        f"- You have {org_rank['num_higher']:,} orgs/users above and"
        f" {org_rank['num_lower']:,} orgs/users below in the ranking of {kind} likes \n\n",
        f"- Organization or user [{space_or_org_id}]({profile_url}) is ranked in the"
        f" top {org_rank['rank']:.2f}% \n\n",
        # The profile page sorts each section via ?sort_<kind>s=likes#<kind>s.
        f"You can find all your {kind.title()}s sorted by likes"
        f" [here]({profile_url}?sort_{kind}s=likes#{kind}s)\n",
    ]
    return _create_footer_message("".join(parts), kind)
def _rank_single_repo(space_or_org_id, kind, filter_zero):
    """Build the Markdown report ranking a single repository by likes.

    Args:
        space_or_org_id: the repository id.
        kind: ``"space"``, ``"model"`` or ``"dataset"``.
        filter_zero: whether zero-like repos are excluded from the ranking.

    Returns:
        Markdown text, ending with the footer from ``_create_footer_message``.

    Raises:
        ValueError: if ``kind`` is not one of the three supported kinds.
    """
    # Each kind lives under a different URL prefix on the Hub: Spaces under
    # /spaces/, datasets under /datasets/, models directly under the root.
    if kind == "space":
        repo_rank = relative_rank_for_space(space_or_org_id, filter_zero=filter_zero)
        repo_url = f"https://huggingface.co/spaces/{space_or_org_id}"
    elif kind == "model":
        repo_rank = relative_rank_for_model(space_or_org_id, filter_zero=filter_zero)
        repo_url = f"https://huggingface.co/{space_or_org_id}"
    elif kind == "dataset":
        repo_rank = relative_rank_for_dataset(space_or_org_id, filter_zero=filter_zero)
        repo_url = f"https://huggingface.co/datasets/{space_or_org_id}"
    else:
        # Fail with a clear message instead of an unbound `repo_rank` below.
        raise ValueError(f"Unsupported kind: {kind}")

    parts = [
        f"## ⭐️ {kind.title()} Likes Rankings ⭐️\n",
        # Link to the repo's own page rather than always to /spaces/.
        f"Here are the rankings by likes for [`{space_or_org_id}`]({repo_url})"
        f" across all {kind}s \n",
        f"- You have {repo_rank['value']:,} likes for this {kind}.\n",
        f"- Your {kind} is ranked {repo_rank['position']:,}.\n",
        f"- {kind.title()} [{space_or_org_id}]({repo_url}) is ranked"
        f" {repo_rank['rank']:.2f}%\n",
        f"- You have {repo_rank['num_higher']:,} {kind}s above and"
        f" {repo_rank['num_lower']:,} {kind}s below in the ranking of {kind}s likes\n\n",
    ]
    return _create_footer_message("".join(parts), kind)
def _create_footer_message(result, kind):
result += """### ✨ Remember likes aren't everything!✨\n"""
if kind == "space":
result += """Some Spaces go very viral whilst other Spaces may be very useful for a smaller audience. If you think your Space is useful, please add it to this [thread](https://huggingface.co./spaces/librarian-bots/ranker/discussions/3) of awesome Spaces.
We'll look out for awesome Spaces added to this thread to promote more widely!"""
return result
def get_top_n_orgs_and_users_spaces(top_n=100):
    """Return up to ``top_n`` (author, total Space likes) pairs, highest first."""
    likes_by_author = create_org_to_space_like_dict()
    ranked = sorted(likes_by_author.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
def get_top_n_orgs_and_users_models(metric, top_n=100):
    """Return up to ``top_n`` (author, summed model metric) pairs, highest first."""
    totals_by_author = create_org_to_model_metrics(metric)
    ranked = sorted(totals_by_author.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
def get_top_n_orgs_and_users_datasets(metric, top_n=100):
    """Return up to ``top_n`` (author, summed dataset metric) pairs, highest first."""
    totals_by_author = create_org_to_dataset_metrics(metric)
    ranked = sorted(totals_by_author.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
def plot_top_n_orgs_and_users(kind, metric="likes", top_n=100):
    """Render the org/user leaderboard for one repo kind as Markdown.

    Args:
        kind: ``"space"``, ``"model"`` or ``"dataset"``.
        metric: metric used for models and datasets (``"likes"`` or
            ``"downloads"``). Spaces are always ranked by likes.
        top_n: maximum number of entries to list.

    Returns:
        A Markdown string: a header followed by a numbered list of links.

    Raises:
        ValueError: if ``kind`` is not one of the three supported kinds.
    """
    if kind == "space":
        # The Space leaderboard helper takes no metric; it only ranks by likes.
        unit = "likes"
        rows = get_top_n_orgs_and_users_spaces(top_n)
    elif kind == "model":
        unit = metric
        rows = get_top_n_orgs_and_users_models(metric, top_n=top_n)
    elif kind == "dataset":
        unit = metric
        rows = get_top_n_orgs_and_users_datasets(metric, top_n=top_n)
    else:
        raise ValueError(f"Unsupported kind: {kind}")

    # The header reflects the requested size and metric instead of a
    # hard-coded "Top 100 ... Likes", and uses the real medal emoji.
    header = f"## 🏅 Top {top_n} Orgs and Users by {kind.title()} {unit.title()} 🏅"
    body = "".join(
        f"\n{place}. [{org}](https://huggingface.co/{org}) with {count:,} {unit}"
        for place, (org, count) in enumerate(rows, start=1)
    )
    return header + body
def get_top_n_spaces(top_n=100):
    """Return up to ``top_n`` (space id, likes) pairs, most liked first."""
    likes_by_space = create_space_to_like_dict()
    ranked = sorted(likes_by_space.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_top_n_models(metric_kind, top_n=100):
    """Return up to ``top_n`` (model id, metric value) pairs, highest first.

    ``metric_kind`` is forwarded to ``create_model_to_like_dict``.
    """
    metric_by_model = create_model_to_like_dict(metric_kind)
    ranked = sorted(metric_by_model.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_top_n_datasets(metric, top_n=100):
    """Return up to ``top_n`` (dataset id, metric value) pairs, highest first.

    ``metric`` is forwarded to ``create_dataset_to_like_dict``.
    """
    metric_by_dataset = create_dataset_to_like_dict(metric)
    ranked = sorted(metric_by_dataset.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
def _plot_top_n_hub_repos(kind: HubRepoType, metric="likes", top_n=100):
    """Render the per-repository leaderboard for one repo kind as Markdown.

    Args:
        kind: which kind of repository to rank.
        metric: metric used for models and datasets (``"likes"`` or
            ``"downloads"``). Ignored for Spaces, which are ranked by likes.
        top_n: maximum number of entries to list.

    Returns:
        A Markdown string: a header followed by a numbered list of links.

    Raises:
        ValueError: if ``kind`` is not a supported ``HubRepoType`` member.
    """
    # Each kind lives under a different URL prefix on the Hub: Spaces under
    # /spaces/, datasets under /datasets/, models directly under the root.
    if kind == HubRepoType.SPACE:
        unit = "likes"
        url_prefix = "https://huggingface.co/spaces/"
        rows = get_top_n_spaces(top_n)
    elif kind == HubRepoType.MODEL:
        unit = metric
        url_prefix = "https://huggingface.co/"
        rows = get_top_n_models(metric, top_n)
    elif kind == HubRepoType.DATASET:
        unit = metric
        url_prefix = "https://huggingface.co/datasets/"
        rows = get_top_n_datasets(metric, top_n)
    else:
        raise ValueError(f"Unsupported kind: {kind}")

    # The header reflects the requested size and metric instead of a
    # hard-coded "Top 100 ... Likes", and uses the real medal emoji.
    header = (
        f"## 🏅 Top {top_n} {kind.value.title()} repositories by {unit.title()} 🏅"
    )
    body = "".join(
        f"\n{place}. [{repo}]({url_prefix}{repo}) with {count:,} {unit}"
        for place, (repo, count) in enumerate(rows, start=1)
    )
    return header + body
def plot_top_n_hub_repos(kind, metric_kind="likes", top_n=100):
    """Render the per-repository leaderboard for a repo kind given as a string.

    Args:
        kind: ``"space"``, ``"model"`` or ``"dataset"``.
        metric_kind: metric forwarded for models and datasets.
        top_n: maximum number of entries to list.

    Returns:
        The Markdown string produced by ``_plot_top_n_hub_repos``.

    Raises:
        ValueError: if ``kind`` is not the value of a ``HubRepoType`` member.
    """
    # HubRepoType(kind) maps the string onto the enum and rejects unknown kinds.
    # Every argument is passed by keyword: passing `top_n` positionally would
    # land in the `metric` slot and the requested size would be ignored.
    return _plot_top_n_hub_repos(HubRepoType(kind), metric=metric_kind, top_n=top_n)
# Gradio UI definition and app launch.
# NOTE(review): indentation was lost in this copy of the file, so the extent of
# each `with` block (Blocks / Row / Accordion) cannot be read off the text below;
# confirm the layout nesting against the original file.
with gr.Blocks() as demo:
# Page title and introductory text.
gr.HTML("<h1 style='text-align: center;'> &#127942; HuggyRanker &#127942; </h1>")
gr.HTML(
"""<p style='text-align: center;'>Rank a single repository or all of the repositories created by an organization or user by likes</p>"""
)
gr.HTML(
"""<p style="text-align: center;"><i>Remember likes aren't everything!</i></p>"""
)
gr.Markdown(
"""## Rank Specific Hub repositories or rank an organization or user by likes
Provide this app with a Hub ID e.g. `librarian-bots/ranker` or a Username/Organization name e.g. `librarian-bots` to rank by likes."""
)
# Inputs for the ranking query: the ID text box, the zero-likes filter and the repo type.
with gr.Row():
space_id = gr.Textbox(
"librarian-bots", max_lines=1, label="Space or user/organization ID"
)
# "yes"/"no" strings; rank_space_and_org compares the value against "yes".
filter_zero_likes = gr.Radio(
choices=["no", "yes"],
label="Filter out repositories with 0 likes in the ranking?",
value="yes",
)
# Also used as the only input of both leaderboard callbacks further down.
repo_type = gr.Radio(
choices=["space", "model", "dataset"],
label="Type of repo",
value="space",
interactive=True,
)
run_btn = gr.Button("Show ranking for this Space or org/user!", label="Rank Space")
result = gr.Markdown()
# Clicking the button calls rank_space_and_org(ID, repo type, zero filter) and
# renders the returned Markdown in `result`.
run_btn.click(
rank_space_and_org,
inputs=[space_id, repo_type, filter_zero_likes],
outputs=result,
)
gr.Markdown("## Leaderboard of Top 100 Spaces and Orgs/Users by Likes")
gr.Markdown(
"""The leaderboard is updated every 30 minutes.
Choose the type of repo to rank by likes and click the button to show the leaderboard."""
)
# One refresh button drives both leaderboards: two click handlers are registered on it.
show_refresh_btn = gr.Button("Show/refresh Leaderboard", label="Refresh")
with gr.Row():
with gr.Accordion("Show rankings for Orgs and Users", open=False):
org_user_ranking = gr.Markdown()
# Only the repo type is passed, so metric and top_n keep their defaults.
show_refresh_btn.click(
plot_top_n_orgs_and_users, inputs=[repo_type], outputs=org_user_ranking
)
with gr.Accordion("Show rankings for individual repositories", open=False):
repo_level_ranking = gr.Markdown()
# Only the repo type is passed, so metric_kind and top_n keep their defaults.
show_refresh_btn.click(
plot_top_n_hub_repos, inputs=[repo_type], outputs=repo_level_ranking
)
# Enable the request queue with concurrency_count=4, then start the app.
demo.queue(concurrency_count=4).launch()