import os
from datetime import datetime, timedelta, timezone
from typing import Any, Dict
import gradio as gr
import pandas as pd
from cachetools import TTLCache, cached
from dotenv import load_dotenv
from httpx import Client
from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map
load_dotenv()
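# Maximum number of recently created datasets to pull from the Hub per refresh.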
LIMIT = 5_000
CACHE_TIME = 60 * 60 * 12 # 12 hours
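# Organisations and users whose datasets are excluded from the listing.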
REMOVE_ORGS = {
"HuggingFaceM4",
"HuggingFaceBR4",
"open-llm-leaderboard",
"TrainingDataPro",
}
HF_TOKEN = os.getenv("HF_TOKEN")
USER_AGENT = os.getenv("USER_AGENT")
if not HF_TOKEN or not USER_AGENT:
raise ValueError(
"Missing required environment variables. Please ensure both HF_TOKEN and USER_AGENT are set."
)
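# Authenticated HTTP client reused for README and datasets-server requests.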
headers = {"authorization": f"Bearer {HF_TOKEN}", "user-agent": USER_AGENT}
client = Client(
headers=headers,
timeout=120,
)
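# Separate TTL caches for the two cached functions below: cachetools keys by
# call arguments only, so no-arg functions sharing one cache would collide.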
datasets_cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
data_cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
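# Cutoff for the "recent" window; anything older than 90 days is dropped.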
def get_three_months_ago():
now = datetime.now(timezone.utc)
return now - timedelta(days=90)
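# Convert a DatasetInfo object to a plain dict, mirroring created_at under
# the "createdAt" key that the filtering logic below expects.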
def add_created_data(dataset):
    dataset_dict = dataset.__dict__
    dataset_dict["createdAt"] = dataset.created_at
    return dataset_dict
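# Fetch a dataset's README and record the length of its card body as "len".
# Returns None when the README is unavailable so callers can filter it out.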
def get_readme_len(dataset: Dict[str, Any]):
    try:
        url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
        resp = client.get(url)
        if resp.status_code == 200:
            card = DatasetCard(resp.text)
            dataset["len"] = len(card.text)
            return dataset
        # No README could be fetched; drop this dataset from the listing.
        return None
    except Exception as e:
        print(e)
        return None
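# Ask the datasets-server API whether a working preview exists for a dataset.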
def check_ds_server_valid(hub_id):
    url = f"https://datasets-server.huggingface.co/is-valid?dataset={hub_id}"
    response = client.get(url)
    if response.status_code != 200:
        return False
    try:
        data = response.json()
        # The endpoint reports booleans, so test the value itself:
        # "preview": false must not count as a working preview.
        return bool(data.get("preview"))
    except Exception as e:
        print(e)
        return False
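# Annotate a dataset dict with the result of the datasets-server check.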
def has_server_preview(dataset):
dataset["server_preview"] = check_ds_server_valid(dataset["id"])
return dataset
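# Turn a repo id into a clickable link to its page on the Hub.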
def render_model_hub_link(hub_id):
    link = f"https://huggingface.co./datasets/{hub_id}"
    # Markdown link, rendered by the gr.DataFrame below via datatype="markdown".
    return f"[{hub_id}]({link})"
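# Fetch the most recently created datasets from the Hub (cached for 12 hours).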
@cached(datasets_cache)
def get_datasets():
    return list(
        tqdm(list_datasets(limit=LIMIT, full=True, sort="createdAt", direction=-1))
    )
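# Build the dataset table: convert to dicts, keep the last 90 days, then
# fetch README lengths and preview status in parallel.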
@cached(data_cache)
def load_data():
datasets = get_datasets()
datasets = [add_created_data(dataset) for dataset in tqdm(datasets)]
filtered = [ds for ds in datasets if ds["createdAt"] > get_three_months_ago()]
ds_with_len = thread_map(get_readme_len, filtered)
ds_with_len = [ds for ds in ds_with_len if ds is not None]
ds_with_valid_status = thread_map(has_server_preview, ds_with_len)
ds_with_valid_status = [ds for ds in ds_with_valid_status if ds is not None]
return ds_with_valid_status
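# Metadata columns that add noise to the rendered table.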
columns_to_drop = [
"cardData",
"gated",
"sha",
"tags",
"description",
"siblings",
"disabled",
"_id",
"private",
"author",
# "citation",
"lastModified",
]
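# Load the data, drop excluded orgs/users, linkify ids, and sort by popularity.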
def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to_drop):
ds_with_len = load_data()
if remove_orgs_and_users:
ds_with_len = [
ds for ds in ds_with_len if ds["author"] not in remove_orgs_and_users
]
df = pd.DataFrame(ds_with_len)
df["id"] = df["id"].apply(render_model_hub_link)
    if columns_to_drop:
        df = df.drop(columns=columns_to_drop, errors="ignore")
df = df.sort_values(by=["likes", "downloads", "len"], ascending=False)
return df
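# Keep only rows created within the last max_age_days.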
def filter_df_by_max_age(df, max_age_days=None):
df = df.dropna(subset=["createdAt"])
now = datetime.now(timezone.utc)
if max_age_days is not None:
max_date = now - timedelta(days=max_age_days)
df = df[df["createdAt"] >= max_date]
return df
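# Keep only rows whose README body has at least min_len characters.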
def filter_by_readme_len(df, min_len=None):
if min_len is not None:
df = df[df["len"] >= min_len]
return df
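# Apply all UI filters; on failure, return an empty frame with the expected columns.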
def filter_df(max_age_days=None, min_len=None, needs_server_preview: bool = False):
try:
df = prep_dataframe()
        if needs_server_preview:
            df = df[df["server_preview"]]
if max_age_days is not None:
df = filter_df_by_max_age(df, max_age_days=max_age_days)
if min_len is not None:
df = filter_by_readme_len(df, min_len=min_len)
df = df.sort_values(by=["likes", "downloads", "len"], ascending=False)
return df
except Exception as e:
print(f"Error filtering dataframe: {str(e)}")
# Return empty dataframe with same columns if there's an error
return pd.DataFrame(
columns=["id", "likes", "downloads", "len", "createdAt", "server_preview"]
)
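# Gradio UI: two sliders and a checkbox drive the filtered table below.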
with gr.Blocks() as demo:
gr.Markdown("# Recent Datasets on the Hub")
gr.Markdown(
"Datasets added in the past 90 days with a README.md and some metadata."
)
with gr.Row():
max_age_days = gr.Slider(
label="Max Age (days)",
value=7,
minimum=0,
maximum=90,
step=1,
interactive=True,
)
min_len = gr.Slider(
label="Minimum README Length",
value=300,
minimum=0,
maximum=1000,
step=50,
interactive=True,
)
needs_server_preview = gr.Checkbox(
label="Exclude datasets without datasets-server preview?",
value=False,
interactive=True,
)
output = gr.DataFrame(
value=filter_df(7, 300, False), # Set initial values explicitly
interactive=False,
datatype="markdown",
min_width=160 * 2.5,
elem_id="dataset_table",
)
def update_df(age, length, preview):
return filter_df(age, length, preview)
# Use a single update function for all inputs
for component in [max_age_days, min_len, needs_server_preview]:
component.change(
fn=update_df,
inputs=[max_age_days, min_len, needs_server_preview],
outputs=[output],
)
demo.launch()