Spaces:

taishi-i
/

awesome-japanese-nlp-resources-dashboard

Running

awesome-japanese-nlp-resources-dashboard

File size: 14,817 Bytes

import json
from collections import Counter

import altair as alt
import japanize_matplotlib
import matplotlib.pyplot as plt
import nagisa
import pandas as pd
import streamlit as st
from datasets import load_dataset
from wordcloud import WordCloud


def read_json(file_name):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file contents.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # (e.g. cp932 or cp1252 on Windows) can corrupt or reject Japanese text.
    with open(file_name, encoding="utf-8") as f:
        return json.load(f)


@st.cache_data
def convert_to_dataframe():
    """Build the DataFrame that backs the dashboard.

    Reads ``awesome-japanese-nlp-resources-search.json``, keeps the columns
    the dashboard uses, sorts the rows by ``score`` (descending), parses the
    commit dates and derives the helper columns listed below.

    Returns:
        A ``pandas.DataFrame`` containing the selected columns plus:

        * ``activity_period``: days between first and latest commit.
        * ``str_languages``: ``languages`` joined with commas.
        * ``year``: year of the first commit.
        * ``tokenized_description``: space-joined tokens of the lower-cased
          description with particles, auxiliary verbs and stopwords removed.
    """
    # Load a json file
    json_file = "awesome-japanese-nlp-resources-search.json"
    df = pd.DataFrame(read_json(json_file))

    # Keep only the columns used by the dashboard, highest score first
    selected_columns = [
        "project_name",
        "description",
        "url",
        "stargazers_count",
        "downloads",
        "model_architectures",
        "source",
        "score",
        "first_commit",
        "latest_commit",
        "languages",
        "model_or_dataset",
        "model_size",
    ]
    df = df[selected_columns].sort_values(by="score", ascending=False)

    # Convert DataFrame for Dashboard
    df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
    df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
    df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days

    # Rows whose dates failed to parse (NaT) compare as False and are dropped
    # together with rows dated before 2009. Copy so that the column
    # assignments below operate on an independent frame, not on a slice.
    has_valid_dates = (df["first_commit"] >= "2009-01-01") & (
        df["latest_commit"] >= "2009-01-01"
    )
    df = df[has_valid_dates].copy()

    df["str_languages"] = df["languages"].apply(
        lambda x: ",".join(x) if isinstance(x, list) else str(x)
    )
    df["year"] = df["first_commit"].dt.year

    dataset = load_dataset("taishi-i/nagisa_stopwords")
    # A set gives O(1) membership tests for the per-token stopword check
    stopwords = set(dataset["nagisa_stopwords"]["words"])

    def tokenize_description(description):
        """Return the description as space-separated content words."""
        # Missing descriptions (None / NaN) have nothing to tokenize
        if not isinstance(description, str):
            return ""
        tokens = nagisa.filter(
            description.lower(), filter_postags=["助詞", "助動詞"]
        )
        kept_words = [
            word
            for word in tokens.words
            if word.strip() and word not in stopwords
        ]
        return " ".join(kept_words)

    df["tokenized_description"] = df["description"].apply(tokenize_description)
    return df


def main():
    """Render the Awesome Japanese NLP resources dashboard.

    The sidebar collects a search keyword, a source type (Hugging Face or
    GitHub), source-specific filters and range sliders. The DataFrame from
    ``convert_to_dataframe`` is filtered accordingly and shown in two
    columns: a table, word cloud and summary tables/charts on the left, and
    Altair charts on the right.
    """
    # Set streamlit page settings
    title = "Awesome Japanese NLP resources Dashboard"
    icon = "🔎"

    st.set_page_config(
        page_title=title,
        page_icon=icon,
        layout="wide",
        initial_sidebar_state="expanded",
    )
    df = convert_to_dataframe()

    # Main streamlit page (sidebar)
    alt.themes.enable("dark")
    with st.sidebar:
        st.title(f"{title} {icon}")
        st.markdown(
            "You can search for open-source software from [1250+ Japanese NLP"
            " repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
        )

        query = st.text_input(label="Search keyword")

        source_type = ["Hugging Face", "GitHub"]
        selected_source_type = st.selectbox(
            "Choose a source type: Hugging Face or GitHub", source_type
        )

        # Filtering GitHub or Hugging Face
        df = df[df["source"] == selected_source_type]
        is_github = selected_source_type == "GitHub"
        # Popularity metric used for statistics, sorting and chart axes
        stats_key = "stargazers_count" if is_github else "downloads"

        # Every optional filter starts disabled so the filtering code below
        # can test it regardless of which source type was chosen.
        selected_language = None
        selected_model_or_dataset = None
        selected_model_architecture = None

        if is_github:
            language_options = (
                df["languages"]
                .dropna()
                .apply(lambda x: x if isinstance(x, list) else [])
                .explode()
                # Exploding an empty list yields NaN; keep real names only
                .dropna()
                .unique()
                .tolist()
            )
            selected_language = st.selectbox(
                "Choose a programming language",
                [""] + language_options,
                index=0,
            )

            min_stars = int(df["stargazers_count"].min())
            max_stars = int(df["stargazers_count"].max())

            stars_range = st.slider(
                "Choose the range for the stargazer count",
                min_value=min_stars,
                max_value=max_stars,
                value=(min_stars, max_stars),
            )
        else:
            selected_model_or_dataset = st.selectbox(
                "Choose a model or a dataset",
                ["", "model", "dataset"],
                index=0,
            )

            # Architectures ordered by how many resources use them
            architecture_counts = Counter(df["model_architectures"].dropna())
            model_architectures_list = [""] + [
                architecture
                for architecture, _ in architecture_counts.most_common()
            ]

            selected_model_architecture = st.selectbox(
                "Choose a model architecture",
                model_architectures_list,
                index=0,
            )

            min_downloads = int(df["downloads"].min())
            max_downloads = int(df["downloads"].max())

            downloads_range = st.slider(
                "Choose the range for the number of downloads",
                min_value=min_downloads,
                max_value=max_downloads,
                value=(min_downloads, max_downloads),
            )

            min_model_size = int(df["model_size"].min())
            max_model_size = int(df["model_size"].max())

            model_size_range = st.slider(
                "Choose the range for the model size (billion)",
                min_value=min_model_size,
                max_value=max_model_size,
                value=(min_model_size, max_model_size),
            )

        min_activity_period = int(df["activity_period"].min())
        max_activity_period = int(df["activity_period"].max())

        activity_period_range = st.slider(
            "Select the range for activity periods (in days)",
            min_value=min_activity_period,
            max_value=max_activity_period,
            value=(min_activity_period, max_activity_period),
        )
        years = sorted(set(df["year"].dropna().astype(int)))

        selected_year_range = st.slider(
            "Select a range for the years of the first commit",
            min_value=years[0],
            max_value=years[-1],
            value=(years[0], years[-1]),
        )

        st.markdown("Sorted by")
        # min_value keeps the value a positive row count for the slices below
        num_show_repos = st.number_input(
            "Number of sorted repositories", min_value=1, value=15
        )
        latest_repos_btn = st.button("Latest repositories")
        trend_repos_btn = st.button("Trend repositories")

    # Apply the sidebar filters (all bounds are inclusive)
    df = df[df["year"].between(*selected_year_range)]

    if is_github:
        df = df[df["stargazers_count"].between(*stars_range)]
    else:
        # Only filter when the slider was moved, so rows without a value
        # (NaN) are kept while the full range is selected.
        if (
            downloads_range[0] > min_downloads
            or downloads_range[1] < max_downloads
        ):
            df = df[df["downloads"].between(*downloads_range)]

        if (
            model_size_range[0] > min_model_size
            or model_size_range[1] < max_model_size
        ):
            df = df[df["model_size"].between(*model_size_range)]

    df = df[df["activity_period"].between(*activity_period_range)]

    # regex=False: the keyword is literal text, so characters such as "+" or
    # "(" must not be interpreted as a regular expression.
    contained_description = df["description"].str.contains(
        query, case=False, na=False, regex=False
    )
    contained_project_name = df["project_name"].str.contains(
        query, case=False, na=False, regex=False
    )
    contained_model_arch = df["model_architectures"].str.contains(
        query, case=False, na=False, regex=False
    )

    df = df[
        contained_description | contained_project_name | contained_model_arch
    ]

    if selected_language:
        # Language names such as "C++" are not valid regular expressions
        df = df[
            df["str_languages"].str.contains(
                selected_language, case=False, na=False, regex=False
            )
        ]

    if selected_model_or_dataset:
        df = df[
            df["model_or_dataset"].str.contains(
                selected_model_or_dataset, case=False, na=False, regex=False
            )
        ]

    if selected_model_architecture:
        df = df[
            df["model_architectures"].str.contains(
                selected_model_architecture,
                case=False,
                na=False,
                regex=False,
            )
        ]

    if latest_repos_btn:
        df = df.sort_values(by="first_commit", ascending=False)
        df = df.head(num_show_repos)

    if trend_repos_btn:
        df = df.sort_values(by=stats_key, ascending=False)
        df = df.head(num_show_repos)

    # Main streamlit page (columns)
    col1, col2 = st.columns(2, gap="large")

    with col1:
        st.markdown("### DataFrame")
        st.markdown(f"#### Number of repositories: {len(df)}")

        if len(df) > 0:
            mean_value = int(df[stats_key].mean())
            min_value = int(df[stats_key].min())
            max_value = int(df[stats_key].max())
            st.markdown(
                f"#### {stats_key} mean: {mean_value}, min: {min_value},"
                f" max: {max_value}"
            )

        st.dataframe(df, height=600)

        if len(df) > 0:
            st.markdown("### Word Cloud")
            combined_text = " ".join(df["tokenized_description"].tolist())

            try:
                wordcloud = WordCloud(
                    width=800,
                    height=400,
                    font_path=japanize_matplotlib.get_font_ttf_path(),
                    max_words=50,
                    colormap="PuBu",
                ).generate(combined_text)
            except ValueError:
                # WordCloud raises ValueError when the text has no usable
                # words (e.g. every remaining description is empty).
                st.info("Not enough text to build a word cloud.")
            else:
                fig, ax = plt.subplots()
                ax.imshow(wordcloud, interpolation="bilinear")
                ax.axis("off")
                st.pyplot(fig, use_container_width=True)
                # pyplot keeps every figure alive until it is closed; close
                # it so figures do not pile up across Streamlit reruns.
                plt.close(fig)

        if is_github:
            # Same list guard as the sidebar: skip rows without a language list
            used_languages = [
                language
                for languages_list in df["languages"]
                if isinstance(languages_list, list)
                for language in languages_list
            ]
            language_counts = Counter(used_languages)
            language_df = pd.DataFrame(
                list(language_counts.items()), columns=["Language", "Count"]
            )
            language_df = language_df.sort_values(by="Count", ascending=False)

            st.markdown("### Language Usage Table")
            st.dataframe(language_df)
        else:
            st.markdown("### Model size vs downloads")
            chart = (
                alt.Chart(df)
                .mark_circle(size=60)
                .encode(
                    x="model_size",
                    y="downloads",
                    tooltip=["project_name", "model_size", "downloads"],
                )
                .properties(
                    title=(
                        "Relationship between model size (Billion) and"
                        " downloads"
                    ),
                )
                .interactive()
            )
            st.altair_chart(chart, use_container_width=True)

            architecture_counts = Counter(df["model_architectures"].dropna())
            model_architectures_df = pd.DataFrame(
                list(architecture_counts.items()),
                columns=["Model_architectures", "Count"],
            )
            model_architectures_df = model_architectures_df.sort_values(
                by="Count", ascending=False
            )

            st.markdown("### Model Architecture Table")
            st.dataframe(model_architectures_df)

    with col2:
        vs_type = stats_key

        st.markdown(f"### First commit vs {vs_type}")
        chart = (
            alt.Chart(df)
            .mark_circle(size=60)
            .encode(
                x="first_commit:T",
                y=f"{vs_type}:Q",
                tooltip=["first_commit", "project_name", f"{vs_type}"],
            )
            .properties(
                title=f"Relationship between first commit date and {vs_type}",
            )
            .interactive()
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Latest commit vs {vs_type}")
        chart = (
            alt.Chart(df)
            .mark_circle(size=60)
            .encode(
                x="latest_commit:T",
                y=f"{vs_type}:Q",
                tooltip=["project_name", "latest_commit", f"{vs_type}"],
            )
            .properties(
                title=f"Relationship between latest commit date and {vs_type}",
            )
            .interactive()
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Activity period vs {vs_type}")
        chart = (
            alt.Chart(df)
            .mark_circle(size=60)
            .encode(
                x=alt.X("activity_period:Q", title="Activity Period (Days)"),
                y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
                tooltip=[
                    "project_name",
                    "activity_period",
                    f"{vs_type}",
                ],
            )
            .properties(
                title=f"Relationship between activity period and {vs_type}",
            )
            .interactive()
        )
        st.altair_chart(chart, use_container_width=True)

        projects_per_year = (
            df.groupby("year").size().reset_index(name="project_count")
        )

        chart = (
            alt.Chart(projects_per_year)
            .mark_bar()
            .encode(
                x=alt.X("year:O", title="Year"),
                y=alt.Y("project_count:Q", title="Number of repositories"),
                tooltip=["year", "project_count"],
            )
            .properties(
                title=(
                    "Number of projects per year based on the year of the"
                    " first commit"
                ),
                width=600,
                height=400,
            )
        )

        st.altair_chart(chart, use_container_width=True)


# Script entry point: build the dashboard when this file is run directly.
if __name__ == "__main__":
    main()