# davanstrien's picture
# davanstrien HF staff
# chore: Update plot_datasets_growth to include growth rate option
# 2b87392
import gradio as gr
import httpx
from toolz import groupby
import plotly.express as px
import pandas as pd
from functools import lru_cache
# Tags/frameworks offered in the dropdown, presented in alphabetical order.
choices = sorted(
    (
        "art",
        "biology",
        "code",
        "distilabel",
        "fiftyone",
        "legal",
        "medical",
        "sentence-transformers",
        "synthetic",
    )
)
@lru_cache(maxsize=100)
def fetch_data(framework):
    """Fetch the Hub datasets carrying a tag and group them by author.

    Args:
        framework: Tag/framework name sent as the ``filter`` query parameter
            of the Hub datasets API.

    Returns:
        A ``(data, grouped)`` tuple. ``data`` is the decoded JSON response;
        ``grouped`` maps each ``"author"`` value to the list of that author's
        entries, ordered from the author with the most datasets to the fewest.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.

    Successful results are memoised per ``framework`` (up to 100 entries).
    The cached objects are shared between calls, so callers must not mutate
    them.
    """
    response = httpx.get(
        "https://huggingface.co./api/datasets",
        # Passing the tag via ``params`` URL-encodes it; the dropdown accepts
        # free-form values, so it may contain characters such as "&" or "#".
        # ``str()`` keeps a non-string value (e.g. ``None``) from being sent
        # as an empty, unfiltered query.
        params={"filter": str(framework)},
    )
    # Fail here with a clear HTTP error rather than later while grouping an
    # error payload that is not a list of datasets.
    response.raise_for_status()
    data = response.json()
    grouped = groupby(lambda x: x["author"], data)
    grouped = dict(sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True))
    return data, grouped
def generate_dashboard(data, grouped, framework):
    """Render a Markdown summary of the datasets found for a tag.

    Args:
        data: Not used by this function; kept so existing callers keep
            working.
        grouped: Mapping of author name to that author's list of datasets.
            Authors are listed in the mapping's iteration order.
        framework: Tag/framework name shown in the heading.

    Returns:
        A Markdown string with the total number of datasets, the number of
        authors, and one bullet per author with their dataset count.
    """
    total_datasets = sum(len(v) for v in grouped.values())
    # Collect the pieces and join once instead of growing a string with "+=".
    parts = [
        f"## Hugging Face datasets for {framework} \n\n",
        f"**Total number of datasets: {total_datasets}**\n\n",
        f"**Total number of authors: {len(grouped)}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, datasets in grouped.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co./{author})\n")
        parts.append(f" - **Number of datasets:** {len(datasets)}\n")
    return "".join(parts)
def _monthly_dataset_counts(data):
    """Aggregate dataset records into per-month counts for completed months.

    Returns a DataFrame with the columns ``month`` ("YYYY-MM" strings),
    ``count``, ``cumulative_count`` and ``growth_rate``. The frame is empty
    when ``data`` has no ``createdAt`` values before the current month.
    """
    df = pd.DataFrame(data)
    if "createdAt" not in df.columns:
        # An empty result yields a frame with no columns at all; return an
        # empty table with the expected columns instead of raising KeyError.
        return pd.DataFrame(
            {
                "month": pd.Series(dtype="object"),
                "count": pd.Series(dtype="int64"),
                "cumulative_count": pd.Series(dtype="int64"),
                "growth_rate": pd.Series(dtype="float64"),
            }
        )
    df["createdAt"] = pd.to_datetime(df["createdAt"])
    df["month"] = df["createdAt"].dt.to_period("M").astype(str)
    # Exclude the current month: it is still in progress, so its count would
    # look like a drop. "YYYY-MM" strings compare chronologically.
    current_month = pd.Period.now("M").strftime("%Y-%m")
    df = df[df["month"] < current_month]
    df_counts = df.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    if df_counts.empty:
        # NOTE(review): some pandas releases raise on pct_change() of an
        # empty Series, so skip the call when there is nothing to compare.
        df_counts["growth_rate"] = pd.Series(dtype="float64")
    else:
        # Only months that have at least one dataset appear as rows, so this
        # compares each row with the previous *observed* month.
        df_counts["growth_rate"] = df_counts["count"].pct_change()
    return df_counts


def plot_datasets_growth(data, framework, show_growth_rate=True):
    """Plot the cumulative number of datasets per month for a tag.

    Args:
        data: Dataset records (dicts) as returned by the Hub API; the
            ``"createdAt"`` field of each record is used. May be empty.
        framework: Tag/framework name used in the title and axis label.
        show_growth_rate: When true, overlay the change in monthly new
            datasets relative to the previous row on a secondary right-hand
            axis.

    Returns:
        A Plotly figure. The current (incomplete) month is left out. With no
        usable data the figure is returned without data points.
    """
    df_counts = _monthly_dataset_counts(data)

    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    fig.update_layout(
        xaxis_title="Month",
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )

    if show_growth_rate:
        # Secondary axis drawn over the primary one, formatted as percentages.
        fig.update_layout(
            yaxis2=dict(
                title="Month-over-Month Growth Rate",
                overlaying="y",
                side="right",
                tickformat=",.0%",
            )
        )
        fig.add_scatter(
            x=df_counts["month"],
            y=df_counts["growth_rate"],
            name="Growth Rate",
            yaxis="y2",
        )

    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        # Subtitle placed in paper coordinates just below the main title.
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets"
                + (" and month-over-month growth rate" if show_growth_rate else ""),
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )
    return fig
def update_dashboard(framework, show_growth_rate=True):
    """Build the plot and Markdown summary for the selected tag.

    Args:
        framework: Tag/framework name chosen in the UI. May be ``None`` or
            empty when nothing is selected yet.
        show_growth_rate: Whether the plot should include the growth-rate
            overlay.

    Returns:
        A ``(figure, markdown)`` tuple. When no tag is selected the figure is
        ``None`` and the Markdown is a short prompt; no request is made.
    """
    if not framework:
        # The checkbox can fire before any tag is chosen, and the dropdown
        # can be cleared; there is nothing to fetch in either case.
        return None, "Select a framework/tag to see its datasets."
    data, grouped = fetch_data(framework)
    dashboard = generate_dashboard(data, grouped, framework)
    fig = plot_datasets_growth(data, framework, show_growth_rate)
    return fig, dashboard
# Assemble the page: two intro texts, the tag selector, the growth-rate
# toggle, and the two outputs (plot and Markdown summary).
with gr.Blocks() as demo:
    gr.Markdown("# Dataset frameworks/tags on the Hub")
    gr.Markdown(
        "This dashboard displays the number of datasets per author and the growth of datasets over time for a given framework/tag."
    )
    framework = gr.Dropdown(
        label="Select a framework/tag",
        choices=choices,
        allow_custom_value=True,
    )
    show_growth_rate = gr.Checkbox(True, label="Show growth rate")
    plot = gr.Plot(label="Growth of datasets over time")
    markdown = gr.Markdown(label="summary")

    # A change to either control re-renders both outputs from the current
    # values of both controls.
    for control in (framework, show_growth_rate):
        control.change(
            update_dashboard,
            inputs=[framework, show_growth_rate],
            outputs=[plot, markdown],
        )

demo.launch()