# Source: app.py, commit 0742247 ("Update app.py") by Weyaxi, 12.6 kB.
import re
import json
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from huggingface_hub import HfApi, list_models, list_datasets, list_spaces
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
import datetime
from openllm import *
# Print the installed Gradio version when the app starts.
print(gr.__version__)
# Module-level Hugging Face Hub client; get_models() and restart_space() use it.
api = HfApi()
def get_most(df_for_most_function):
    """Summarise the most-downloaded and the most-liked row of a frame.

    Args:
        df_for_most_function: DataFrame with ``id``, ``downloads`` and
            ``likes`` columns and at least one row.

    Returns:
        ``{"Most Download": {...}, "Most Likes": {...}}`` where each inner
        dict carries the ``id``, ``downloads`` and ``likes`` of the top row.

    Raises:
        IndexError: if the frame has no rows.
    """
    def _top_row(column):
        # Highest value first, then take the leading row.
        ranked = df_for_most_function.sort_values(by=[column], ascending=False)
        return ranked.iloc[0]

    def _summary(row):
        return {key: row[key] for key in ("id", "downloads", "likes")}

    return {
        "Most Download": _summary(_top_row("downloads")),
        "Most Likes": _summary(_top_row("likes")),
    }
def get_sum(df_for_sum_function):
    """Total the ``downloads`` and ``likes`` columns of a frame.

    Args:
        df_for_sum_function: DataFrame with ``downloads`` and ``likes``
            columns.

    Returns:
        ``{"Downloads": <total>, "Likes": <total>}``.
    """
    totals = {}
    for label, column in (("Downloads", "downloads"), ("Likes", "likes")):
        # Sum the plain Python values produced by tolist().
        totals[label] = sum(df_for_sum_function[column].tolist())
    return totals
def get_openllm_leaderboard():
    """Return the ``Model`` column of the Open LLM Leaderboard as a list.

    The data comes from ``get_json_format_data`` and ``get_datas``, which are
    not defined in this file (presumably provided by ``from openllm import *``
    -- confirm).  The order of the returned list is whatever order those
    helpers produce.
    """
    finished_models = get_datas(get_json_format_data())
    leaderboard = pd.DataFrame(finished_models)
    return leaderboard['Model'].tolist()
def get_ranking(model_list, target_org):
    """Find the first model in ``model_list`` owned by ``target_org``.

    Ownership is the part of the model id before the first ``/`` and is
    compared case-insensitively.

    Args:
        model_list: list of model ids, best-ranked first.
        target_org: organization name to look for.

    Returns:
        ``[rank, model_id]`` (rank is 1-based) for the first match,
        ``"Not Found"`` when nothing matches, or ``"Error on Leaderboard"``
        when ``model_list`` is an empty list.
    """
    if model_list == []:
        return "Error on Leaderboard"
    matches = (
        [position, name]
        for position, name in enumerate(model_list, start=1)
        if name.split("/")[0].lower() == target_org.lower()
    )
    return next(matches, "Not Found")
def get_models(which_one):
    """List every namespaced repo of one kind on the Hugging Face Hub.

    Args:
        which_one: ``"models"``, ``"datasets"`` or ``"spaces"``.

    Returns:
        A list of ``{"author", "id", "downloads", "likes"}`` dicts, one per
        repo whose id contains a ``/``.  ``author`` is the part of the id
        before the first ``/``.  For spaces ``downloads`` is always ``0``.

    Raises:
        ValueError: if ``which_one`` is not one of the three supported kinds.
    """
    if which_one not in ("models", "datasets", "spaces"):
        raise ValueError(
            f"which_one must be 'models', 'datasets' or 'spaces', got {which_one!r}"
        )
    listers = {
        "models": api.list_models,
        "datasets": api.list_datasets,
        "spaces": api.list_spaces,
    }
    is_space = which_one == "spaces"

    all_list = []
    for entry in tqdm(listers[which_one](), desc=f"Scraping {which_one}", position=0, leave=True):
        info = entry.__dict__
        repo_id = info["id"]
        parts = repo_id.split("/")
        if len(parts) == 1:
            # Ids without a namespace have no author to attribute them to.
            continue
        all_list.append({
            "author": parts[0],
            "id": repo_id,
            # The downloads field is not read for spaces; report 0 instead.
            "downloads": 0 if is_space else info["downloads"],
            "likes": info["likes"],
        })
    return all_list
def search(models_dict, author_name):
    """Return the entries of ``author_name`` as a DataFrame.

    Args:
        models_dict: mapping of author name to a list of entry dicts.
        author_name: key to look up.

    Returns:
        A DataFrame built from the author's entries; an empty DataFrame when
        the author is not present in ``models_dict``.
    """
    entries = models_dict.get(author_name, [])
    return pd.DataFrame(entries)
def group_models_by_author(all_things):
    """Group entry dicts by their ``author`` value.

    Args:
        all_things: iterable of dicts that each have an ``author`` key.

    Returns:
        A dict mapping each author to the list of its entries, in the order
        they were encountered.
    """
    grouped = {}
    for entry in all_things:
        grouped.setdefault(entry['author'], []).append(entry)
    return grouped
def make_leaderboard(orgs, which_one, data):
    """Build the per-organization leaderboard frame for one repo kind.

    Args:
        orgs: iterable of organization names to rank.
        which_one: ``"models"``, ``"datasets"`` or ``"spaces"``.
        data: mapping of author name to that author's entry dicts (the shape
            produced by ``group_models_by_author``).

    Returns:
        A DataFrame with one row per organization that has at least one
        entry, sorted descending by ``Total Downloads`` (``Total Likes`` for
        spaces), with a 1-based ``Serial Number`` inserted as first column.
        Column order matters: the UI labels the columns by position.

    Raises:
        KeyError: if no organization has any entry, because the frame then
            has no column to sort by.
    """
    open_llm_leaderboard = get_openllm_leaderboard() if which_one == "models" else None
    trend = get_trending_list(1, which_one)

    data_rows = []
    for org in tqdm(orgs, desc=f"Proccesing Organizations ({which_one})", position=0, leave=True):
        df = search(data, org)
        if len(df) == 0:
            continue
        llm_ranking = get_ranking(open_llm_leaderboard, org) if which_one == "models" else None
        data_rows.append(
            _leaderboard_row(org, which_one, df, get_ranking_trend(trend, org), llm_ranking)
        )

    leaderboard = pd.DataFrame(data_rows)
    sort_column = "Total Likes" if which_one == "spaces" else "Total Downloads"
    leaderboard = leaderboard.sort_values(by=[sort_column], ascending=False)
    leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
    return leaderboard


# Singular / plural nouns used in the column labels of each repo kind.
_KIND_LABELS = {
    "models": ("Model", "Models"),
    "datasets": ("Dataset", "Datasets"),
    "spaces": ("Space", "Spaces"),
}


def _leaderboard_row(org, which_one, df, rank, llm_ranking):
    """Build one leaderboard row for ``org`` as an ordered column -> value dict.

    ``df`` must be non-empty.  ``rank`` is the dict returned by
    ``get_ranking_trend``.  ``llm_ranking`` is the ``get_ranking`` result and
    is only read when ``which_one`` is ``"models"``.
    """
    singular, plural = _KIND_LABELS[which_one]
    has_downloads = which_one != "spaces"
    num_things = len(df)
    sum_info = get_sum(df)
    most_info = get_most(df)

    row = {"Organization Name": org}
    if has_downloads:
        row["Total Downloads"] = sum_info["Downloads"]
    row["Total Likes"] = sum_info["Likes"]
    row[f"Number of {plural}"] = num_things

    if which_one == "models":
        if isinstance(llm_ranking, str):
            # get_ranking signals "Not Found" / "Error on Leaderboard" with a
            # plain string; show that message instead of indexing into it.
            best_rank = best_model = llm_ranking
        else:
            best_rank, best_model = llm_ranking
        row["Best Model On Open LLM Leaderboard"] = best_model
        row["Best Rank On Open LLM Leaderboard"] = best_rank

    if has_downloads:
        row[f"Average Downloads per {singular}"] = int(sum_info["Downloads"] / num_things)
    row[f"Average Likes per {singular}"] = int(sum_info["Likes"] / num_things)

    if has_downloads:
        row[f"Most Downloaded {singular}"] = most_info["Most Download"]["id"]
        row["Most Download Count"] = most_info["Most Download"]["downloads"]
    row[f"Most Liked {singular}"] = most_info["Most Likes"]["id"]
    row["Most Like Count"] = most_info["Most Likes"]["likes"]

    row[f"Trending {singular}"] = rank['id']
    row[f"Best Rank at Trending {plural}"] = rank['rank']
    return row
def clickable(x, which_one):
    """Wrap a Hub id in an HTML anchor pointing at its page.

    Args:
        x: the id to link, or the sentinel ``"Not Found"``.
        which_one: ``"models"`` links to ``/<x>``; any other value links to
            ``/<which_one>/<x>``.

    Returns:
        The anchor markup, or ``"Not Found"`` unchanged for the sentinel.
    """
    if x == "Not Found":
        return "Not Found"
    path = x if which_one == "models" else f"{which_one}/{x}"
    return f'<a target="_blank" href="https://huggingface.co./{path}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{x}</a>'
def models_df_to_clickable(df, columns, which_one):
    """Replace the values of ``columns`` with HTML links, in place.

    The ``Organization Name`` column is always linked with the ``"models"``
    URL form; every other column uses ``which_one``.

    Args:
        df: leaderboard DataFrame; it is modified and also returned.
        columns: names of the columns to convert.
        which_one: repo kind passed through to ``clickable``.

    Returns:
        The same ``df`` object.
    """
    for column in columns:
        link_kind = "models" if column == "Organization Name" else which_one
        df[column] = df[column].apply(lambda value, kind=link_kind: clickable(value, kind))
    return df
def get_trending_list(pages, which_one, timeout=30):
    """Fetch the first ``pages`` pages of the Hub's ``<which_one>-json`` listing.

    Args:
        pages: number of pages to request (page indices ``0 .. pages-1``).
        which_one: ``"models"``, ``"datasets"`` or ``"spaces"``; also the key
            under which the response JSON holds the entries.
        timeout: seconds to wait for each HTTP request before giving up, so
            a stalled connection cannot block startup forever.

    Returns:
        A list of ``{"id", "downloads", "likes"}`` dicts in response order;
        for spaces the ``downloads`` key is omitted.

    Raises:
        requests.HTTPError: if a page responds with an error status.
        requests.Timeout: if a page does not respond within ``timeout``.
    """
    trending_list = []
    for page in range(pages):
        # NOTE(review): the host is written with a trailing dot; kept as found.
        response = requests.get(
            f"https://huggingface.co./{which_one}-json?p={page}", timeout=timeout
        )
        # Fail with the HTTP status rather than an opaque JSON/key error.
        response.raise_for_status()
        for thing in response.json()[which_one]:
            entry = {"id": thing["id"]}
            if which_one != "spaces":
                entry["downloads"] = thing["downloads"]
            entry["likes"] = thing["likes"]
            trending_list.append(entry)
    return trending_list
def get_ranking_trend(json_data, org_name):
    """Find the first trending entry owned by ``org_name``.

    The owner is the part of the entry id before the first ``/`` and is
    compared case-sensitively.

    Args:
        json_data: list of dicts with an ``id`` key, in trending order.
        org_name: organization name to look for.

    Returns:
        ``{"id": <entry id>, "rank": <1-based position>}`` for the first
        match, otherwise ``{"id": "Not Found", "rank": "Not Found"}``.
    """
    ids = [item['id'] for item in json_data]
    owners = [repo_id.split("/")[0] for repo_id in ids]
    try:
        position = owners.index(org_name)
    except ValueError:
        return {"id": "Not Found", "rank": "Not Found"}
    return {"id": ids[position], "rank": position + 1}
def restart_space():
    """Ask the Hub to restart the ``TFLai/organization-leaderboard`` Space.

    The access token is taken from a module-level ``HF_TOKEN`` when one
    exists; this file does not define it, so otherwise the ``HF_TOKEN``
    environment variable is used.  If neither is set ``None`` is passed and
    credential lookup is left to ``huggingface_hub`` (presumably its stored
    login -- confirm).
    """
    import os

    try:
        token = HF_TOKEN
    except NameError:
        # No module-level token in scope; fall back to the environment.
        token = os.environ.get("HF_TOKEN")

    print("Restarting...")
    api.restart_space(repo_id="TFLai/organization-leaderboard", token=token)
# Organization names to rank, one per line of org_names.txt.
with open("org_names.txt", "r") as f:
    org_names_in_list = [line.rstrip("\n") for line in f]

# Start-up time shown as "last update".  It is stored under its own name so
# the imported `datetime` module is not overwritten by a string.
last_update_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")

# Markdown shown at the top of the page.
INTRODUCTION_TEXT = f"""
🎯 The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co./spaces/HuggingFaceH4/open_llm_leaderboard).
## Available Dataframes:
- 🏛️ Models
- 📊 Datasets
- 🚀 Spaces
## User Leaderboard
You can access our User Leaderboard by visiting this link:
- 🔗 [User Leaderboard](https://huggingface.co./spaces/PulsarAI/user-leaderboard)
## Backend
🛠️ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co./docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
🛠️ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co./organizations).
**🌐 Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co./spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.
**🌐 Note:** In trending models/datasets/spaces, first 300 models/datasets/spaces is being retrieved from huggingface.
## Last Update
⌛ This space is last updated in **{last_update_time}**.
"""
# Build the page: a title, the introduction, then one tab per repo kind.
# All Hub data is fetched once, here, while the UI is being constructed.
with gr.Blocks() as demo:
    gr.Markdown("""<h1 align="center" id="space-title">🤗 Organization Leaderboard</h1>""")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    all_models = get_models("models")
    all_datasets = get_models("datasets")
    all_spaces = get_models("spaces")

    # In each tab, `headers` and `datatype` are matched to the frame's
    # columns by position, so their order must follow make_leaderboard().
    with gr.TabItem("🏛️ Models", id=1):
        columns_to_convert = ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model", "Trending Model"]
        models_df = make_leaderboard(org_names_in_list, "models", group_models_by_author(all_models))
        models_df = models_df_to_clickable(models_df, columns_to_convert, "models")
        headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "🤖 Number of Models", "🏆 Best Model On Open LLM Leaderboard", "🥇 Best Rank On Open LLM Leaderboard", "📊 Average Downloads per Model", "📈 Average Likes per Model", "🚀 Most Downloaded Model", "📈 Most Download Count", "❤️ Most Liked Model", "👍 Most Like Count", "🔥 Trending Model", "👑 Best Rank at Trending Models"]
        gr.Dataframe(models_df.head(400), headers=headers, interactive=True, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str", "markdown", "str"])

    with gr.TabItem("📊 Datasets", id=2):
        columns_to_convert = ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset", "Trending Dataset"]
        dataset_df = make_leaderboard(org_names_in_list, "datasets", group_models_by_author(all_datasets))
        dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")
        headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "📊 Number of Datasets", "📊 Average Downloads per Dataset", "📈 Average Likes per Dataset", "🚀 Most Downloaded Dataset", "📈 Most Download Count", "❤️ Most Liked Dataset", "👍 Most Like Count", "🔥 Trending Dataset", "👑 Best Rank at Trending Datasets"]
        gr.Dataframe(dataset_df.head(250), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "str", "str", "markdown", "str", "markdown", "str", "markdown", "str"])

    with gr.TabItem("🚀 Spaces", id=3):
        columns_to_convert = ["Organization Name", "Most Liked Space", "Trending Space"]
        spaces_df = make_leaderboard(org_names_in_list, "spaces", group_models_by_author(all_spaces))
        spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")
        headers = ["🔢 Serial Number", "🏢 Organization Name", "👍 Total Likes", "🚀 Number of Spaces", "📈 Average Likes per Space", "❤️ Most Liked Space", "👍 Most Like Count", "🔥 Trending Space", "👑 Best Rank at Trending Spaces"]
        gr.Dataframe(spaces_df.head(200), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"])

# Restart the Space periodically so the data above is rebuilt.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=21600) # 6 hours
# A job that is only added never fires; the scheduler has to be started, and
# before launch() because launch() blocks.
scheduler.start()
demo.launch()