Spaces:

nielsr
/

community-science-progress

Sleeping

File size: 7,196 Bytes

57c87c9
 
 
 
9325c4d
404478b
57c87c9
2adbdb9
1396667
57c87c9
 
404478b
 
 
 
 
 
 
 
4dd059d
404478b
 
 
 
 
 
3170ddb
 
1396667
9325c4d
 
 
1396667
3170ddb
404478b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2adbdb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3170ddb
 
 
 
 
9325c4d
3170ddb
9325c4d
404478b
 
3170ddb
 
 
 
 
404478b
b58eec2
 
2adbdb9
 
b58eec2
2adbdb9
4dd059d
 
2adbdb9
404478b
 
57c87c9
b58eec2
57c87c9
570845b
 
 
 
404478b
570845b
 
 
 
 
2adbdb9
 
 
570845b
 
 
 
 
2adbdb9
570845b
b58eec2
2adbdb9
570845b
 
2adbdb9
 
 
570845b
404478b
 
 
 
 
 
2adbdb9
570845b
404478b
570845b
2adbdb9
570845b
 
2adbdb9
 
 
570845b
404478b
 
 
 
 
 
 
 
 
 
 
 
 
2adbdb9
570845b
b58eec2
570845b
2adbdb9
570845b
 
2adbdb9
 
 
 
404478b
 
570845b
 
 
57c87c9

from datetime import datetime

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datasets import Dataset
from load_dataframe import get_data


def aggregated_data(df, aggregation_level="week"):
    """Plot the share of papers with at least one artifact and show its average growth.

    Writes a heading, a metric with the mean period-over-period growth rate and
    a line chart to the current Streamlit page.

    Args:
        df: DataFrame whose index supports ``resample`` (i.e. is datetime-like)
            and which has ``num_models``, ``num_datasets`` and ``num_spaces``
            columns. The frame is left unmodified.
        aggregation_level: ``"week"`` resamples weekly (``'W'``); any other
            value resamples by month end (``'ME'``).
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # A paper counts as "having an artifact" if it links to any model, dataset or space.
    # Kept as a standalone Series so the caller's DataFrame is not mutated.
    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    # Bucket the papers per period (week or month end)
    freq = 'W' if aggregation_level == "week" else 'ME'
    per_period = has_artifact.resample(freq)
    total_papers = per_period.size()
    papers_with_artifacts = per_period.sum()

    # Percentage of papers with artifacts per period (NaN for periods without papers)
    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

    # Period-over-period growth; a 0% -> x% step yields +/-inf, which would
    # poison the mean, so those steps are dropped. np.nan keeps the dtype float.
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
    growth_rate = growth_rate.replace([np.inf, -np.inf], np.nan).dropna()

    # Display the average growth rate as a big number ("N/A" when there is no
    # usable growth data, instead of a literal "nan%")
    if growth_rate.empty:
        growth_text = "N/A"
    else:
        growth_text = f"{growth_rate.mean():.2f}%"
    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=growth_text)

    # Draw on an explicit figure rather than pyplot's implicit global one
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        percentage_papers_with_artifacts.index,
        percentage_papers_with_artifacts,
        marker='o',
        linestyle='-',
        color='b',
        label='Percentage of Papers with at least 1 Artifact',
    )

    # Percentages always live in [0, 100]
    ax.set_ylim(0, 100)

    ax.set_xlabel(aggregation_level)
    ax.set_ylabel('Percentage')
    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
    ax.legend()
    ax.grid(True)

    # Render through Streamlit, then close the figure so repeated script
    # reruns do not accumulate open matplotlib figures.
    st.pyplot(fig)
    plt.close(fig)


def show_data_editor(df: pd.DataFrame, key: str):
    """Render ``df`` as an editable Streamlit table.

    Args:
        df: Frame to display; only the columns listed in ``visible_columns``
            are shown, in that order.
        key: Streamlit widget key, so several editors can coexist on one page.

    Edits made in the table are currently not persisted (see the TODO below).
    """
    visible_columns = (
        "reached_out",
        "reached_out_link",
        "paper_page",
        "title",
        "github",
        "num_models",
        "num_datasets",
        "num_spaces",
    )

    # Columns that should be rendered as clickable links
    link_columns = {
        "github": st.column_config.LinkColumn(),
        "paper_page": st.column_config.LinkColumn(),
        "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
    }

    edited_frame = st.data_editor(
        df,
        hide_index=True,
        column_order=visible_columns,
        column_config=link_columns,
        width=2000,
        key=key,
    )

    # TODO: persist edits. Comparing ``edited_frame`` with ``df`` and saving the
    # edited frame wholesale is wrong because ``df`` is only a filtered view;
    # the edited rows should be merged back into the full dataframe (overwriting
    # the matching rows) and that merged dataframe passed to save_data().


def save_data(df: pd.DataFrame):
    """Convert ``df`` to a Hugging Face ``Dataset`` and push it to the Hub.

    The target repository is ``nielsr/daily-papers-enriched``. Returns ``None``.
    """
    Dataset.from_pandas(df).push_to_hub("nielsr/daily-papers-enriched")


def display_data(df: pd.DataFrame):
    """Show summary statistics and three editable tables for a set of papers.

    The tables are: papers with at least one artifact, papers without
    artifacts, and papers whose README mentions HF (``hf_mention == 1``) but
    that have no artifacts.

    Args:
        df: Papers to display. Must contain the ``num_models``,
            ``num_datasets``, ``num_spaces``, ``github`` and ``hf_mention``
            columns. The frame is copied, so the caller's data is untouched.
    """
    # Callers pass a filtered slice of a larger frame; adding columns to it in
    # place would mutate their data and can trigger SettingWithCopyWarning.
    df = df.copy()

    df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
    num_papers = df.shape[0]
    num_artifacts = df['has_artifact'].sum()

    # Guard against an empty selection (e.g. a day without papers)
    share_with_artifact = num_artifacts / num_papers if num_papers > 0 else 0
    percentage_of_at_least_one_artifact = round(share_with_artifact * 100, 2)

    # add reached out and reached out link columns (scalars broadcast to every row)
    df['reached_out'] = False
    df["reached_out_link"] = ""

    st.markdown(f"""
    ## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact
    
    * Number of papers: {num_papers}
    * Number of papers with a Github link: {df['github'].notnull().sum()}
    * Number of papers with at least one HF artifact: {num_artifacts}
    """)

    st.write("Papers with at least one artifact")
    show_data_editor(df[df['has_artifact']], key="papers_with_artifacts")

    st.write("Papers without artifacts")
    show_data_editor(df[~df['has_artifact']], key="papers_without_artifacts")

    st.write("Papers with a HF mention in README but no artifacts")
    show_data_editor(df[(df['hf_mention'] == 1) & (~df['has_artifact'])], key="papers_with_hf_mention_no_artifacts")


def main():
    """Build the dashboard page.

    The sidebar switches between two views:

    * "Daily/weekly/monthly data": statistics and editable tables for a single
      day, ISO week or calendar month.
    * "Aggregated data": weekly and monthly trend plots over all data.
    """
    st.title("Hugging Face Artifacts KPI Dashboard")

    # Sidebar navigation between the per-period view and the aggregated view
    st.sidebar.title("Navigation")
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    if selection == "Daily/weekly/monthly data":
        # Granularity of the period to inspect
        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])

        # get the latest dataframe (needed by every granularity)
        df = get_data()

        if view_level == "day":
            # date picker, defaulting to today
            day = st.date_input("Select day", value="today", format="DD/MM/YYYY")
            # convert to a Pandas Timestamp to get day_name()/strftime()
            day = pd.Timestamp(day)

            filtered_df = df[df.index.date == day.date()]

            st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
            display_data(df=filtered_df)

        elif view_level == "week":
            # ISO week numbers run from 1 to 53. The default is the current ISO
            # week, so the upper bound must be 53 or the default can fall
            # outside the allowed range in long ISO years.
            week_number = st.number_input("Select week", value=datetime.today().isocalendar()[1], min_value=1, max_value=53)

            # Compare against the ISO week of each row without adding a helper
            # column to the frame returned by get_data().
            # NOTE(review): there is no year filter here, so the same week
            # number from different years is shown together — confirm intended.
            iso_week = df.index.isocalendar().week
            filtered_df = df[iso_week == week_number]

            st.write(f"Showing data for week {week_number}")

            display_data(df=filtered_df)

        elif view_level == "month":
            month_names = [
                "January", "February", "March", "April", "May", "June",
                "July", "August", "September", "October", "November", "December",
            ]

            # month selector, defaulting to the current month
            month_str = st.selectbox("Select month", options=month_names, index=datetime.today().month - 1)

            # Offer every year present in the data (most recent preselected);
            # fall back to "2024" when the data has no usable dates.
            years_in_data = sorted(int(y) for y in df.index.year.dropna().unique())
            year_options = [str(y) for y in years_in_data] or ["2024"]
            year_str = st.selectbox("Select year", options=year_options, index=len(year_options) - 1)

            # Convert the selection to numbers and filter on month and year
            month = month_names.index(month_str) + 1
            year = int(year_str)
            filtered_df = df[(df.index.month == month) & (df.index.year == year)]

            st.write(f"Showing data for {month_str} {year_str}")

            display_data(df=filtered_df)

    elif selection == "Aggregated data":

        # get the latest dataframe
        df = get_data()

        # weekly trend first, then monthly
        aggregated_data(df)
        aggregated_data(df, aggregation_level="month")

    else:
        st.write("Error: selection not recognized")


# Build the dashboard only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()