import os import gradio as gr import pandas as pd import time import threading from huggingface_hub import HfApi from humanize import naturalsize api = HfApi() HF_TOKEN = os.getenv('HF_TOKEN') def clickable(x): return f'{x}' def apply_headers(df, headers): tmp = df.copy() tmp.columns = headers return tmp def search(search_text): if not search_text: return df return df[df['👤 Author'].str.contains(search_text, case=False, na=False)] df = pd.read_csv("author_data_hf_merged.csv") df_author_copy = df.copy() df["author"] = df["author"].apply(lambda x: clickable(x)) df['Total Usage'] = df[['models', 'datasets', 'spaces']].sum(axis=1) df = df.sort_values(by='Total Usage', ascending=False) sum_all_author = naturalsize(sum(df['models'].tolist()+df['datasets'].tolist()+df['spaces'].tolist())) naturalsize_columns = ['Total Usage', 'models', 'datasets', 'spaces'] df[naturalsize_columns] = df[naturalsize_columns].map(naturalsize) df['Serial Number'] = [i for i in range(1, len(df)+1)] df = df[['Serial Number', "author", "Total Usage", "models", "datasets", "spaces"]] df = apply_headers(df, ["🔢 Serial Number", "👤 Author", "⚡️ Total Usage", "🏛️ Models", "📊 Datasets", "🚀 Spaces"]) desc = f""" 🎯 The Leaderboard aims to track authors data usage in 🤗 Huggingface. ## 📄 Information 🛠️ This leaderboard consists of 125k authors scraped from [🤗 Huggingface Leaderboard](https://huggingface.co./spaces/Weyaxi/huggingface-leaderboard). These 125k authors have been selected based on their [🤗 Huggingface Leaderboard](https://huggingface.co./spaces/Weyaxi/huggingface-leaderboard) positions: - 🤖 Top 60k authors in the models category - 📊 Top 60k authors in the datasets category - 🚀 Top 50k authors in the spaces category ## 📒 Notes Note that these numbers may not be entirely accurate due to the following reasons: - I only calculated the data usage from the main branch and did not include deleted files that cannot be directly seen. - There may be large datasets/models to which I don't have access (either private or gated). # 📶 Total Data Usage From All Authors According to this leaderboard, there is a total of {sum_all_author} of data on this platform. """ # Write note maybe? title = """