![davanstrien's picture](https://cdn-avatars.huggingface.co/v1/production/uploads/1627505688463-60107b385ac3e86b3ea4fc34.jpeg)
davanstrien
HF staff
chore: Update Hugging Face dataset dashboard with total number of authors
bb1ee8a
import gradio as gr | |
import httpx | |
from toolz import groupby | |
import plotly.express as px | |
import pandas as pd | |
def fetch_data(framework):
    """Fetch the datasets matching *framework* from the Hugging Face Hub API.

    Args:
        framework: Value sent as the ``filter`` query parameter of the
            ``/api/datasets`` endpoint.

    Returns:
        A ``(data, grouped)`` tuple. ``data`` is the decoded JSON response;
        ``grouped`` is a dict mapping each ``"author"`` value to the list of
        entries with that author, ordered from most to fewest entries.

    Raises:
        httpx.HTTPStatusError: If the API does not answer with a success
            status.
        KeyError: If an entry in the response has no ``"author"`` key.
    """
    # Let httpx build the query string: the dropdown accepts custom values, so
    # characters such as "&", "#" or spaces must be percent-encoded rather
    # than spliced into the URL verbatim.
    response = httpx.get(
        "https://huggingface.co./api/datasets",
        params={"filter": framework},
    )
    # Fail loudly on an error response instead of grouping an error payload.
    response.raise_for_status()
    data = response.json()

    by_author = groupby(lambda entry: entry["author"], data)
    # sorted() is stable, so authors with equal counts keep first-seen order.
    grouped = dict(
        sorted(by_author.items(), key=lambda item: len(item[1]), reverse=True)
    )
    return data, grouped
def generate_dashboard(data, grouped, framework):
    """Render a Markdown summary of the datasets grouped by author.

    Args:
        data: Unused; kept so existing callers do not need to change.
        grouped: Mapping of author to the collection of that author's
            datasets. Authors are listed in the mapping's iteration order.
        framework: Label interpolated into the heading.

    Returns:
        A Markdown string with the total dataset count, the number of
        authors, and one bullet per author with that author's dataset count.
    """
    total_datasets = sum(len(datasets) for datasets in grouped.values())

    # Collect the pieces and join once instead of growing a string with "+=".
    parts = [
        f"## Hugging Face Datasets for {framework} \n\n",
        f"**Total number of datasets: {total_datasets}**\n\n",
        f"**Total number of authors: {len(grouped)}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, datasets in grouped.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co./{author})\n")
        parts.append(f" - **Number of datasets:** {len(datasets)}\n")
    return "".join(parts)
def plot_datasets_growth(data, framework):
    """Plot cumulative dataset count and per-month growth rate.

    Args:
        data: Records convertible to a ``pandas.DataFrame`` that has a
            ``"createdAt"`` column parseable by ``pandas.to_datetime``.
        framework: Label used in the figure title and the y-axis title.

    Returns:
        A Plotly figure with the cumulative count as a line on the left
        y-axis and the growth rate as a second trace on the right y-axis.

    Raises:
        KeyError: If ``data`` is empty or has no ``"createdAt"`` column.
    """
    df = pd.DataFrame(data)
    df["createdAt"] = pd.to_datetime(df["createdAt"])
    df["month"] = df["createdAt"].dt.to_period("M").astype(str)

    # Only months with at least one dataset appear here, so the growth rate
    # is relative to the previous month that has data; the first is NaN.
    df_counts = df.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    df_counts["growth_rate"] = df_counts["count"].pct_change()

    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    fig.update_layout(
        xaxis_title="Month",
        # The closing parenthesis was missing from this title. The separate
        # ``yaxis_title`` keyword was dropped because this entry replaced it.
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        # Secondary axis for the growth-rate trace, drawn over the primary one.
        yaxis2=dict(
            title="Month-over-Month Growth Rate",
            overlaying="y",
            side="right",
            tickformat=",.0%",
        ),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )
    fig.add_scatter(
        x=df_counts["month"], y=df_counts["growth_rate"], name="Growth Rate", yaxis="y2"
    )
    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        # Subtitle placed in paper coordinates.
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets and month-over-month growth rate",
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )
    return fig
def update_dashboard(framework):
    """Refresh the plot and Markdown summary for the selected framework.

    Args:
        framework: Current dropdown value; may be a custom string, or empty
            when nothing is selected.

    Returns:
        A ``(figure, markdown)`` tuple for the plot and Markdown outputs.
        ``figure`` is ``None`` when there is nothing to plot.
    """
    # Nothing selected: clear both outputs without calling the API.
    if not framework:
        return None, ""

    data, grouped = fetch_data(framework)
    dashboard = generate_dashboard(data, grouped, framework)

    # A tag matching no datasets yields an empty result, and building the plot
    # from it would fail on the missing "createdAt" column. Show the (zero)
    # summary and leave the plot empty instead.
    if not data:
        return None, dashboard

    fig = plot_datasets_growth(data, framework)
    return fig, dashboard
# UI: choosing (or typing) a framework tag re-renders the growth plot and the
# per-author Markdown summary.
with gr.Blocks() as demo:
    framework = gr.Dropdown(
        choices=["distilabel", "sentence-transformers", "synthetic"],
        allow_custom_value=True,
    )
    plot = gr.Plot()
    markdown = gr.Markdown()

    # update_dashboard returns (figure, markdown), matching the output order.
    framework.change(
        fn=update_dashboard,
        inputs=[framework],
        outputs=[plot, markdown],
    )

demo.launch()