taishi-i committed
Commit 2ffaad6
1 Parent(s): 1db9748

update app.py

Files changed (2)
  1. app.py +282 -226
  2. requirements.txt +5 -0
app.py CHANGED
@@ -1,8 +1,13 @@
 import json
 
 import altair as alt
+import japanize_matplotlib
+import matplotlib.pyplot as plt
+import nagisa
 import pandas as pd
 import streamlit as st
+from datasets import load_dataset
+from wordcloud import WordCloud
 
 
 def read_json(file_name):
@@ -11,256 +16,307 @@ def read_json(file_name):
     return json_data
 
 
-# Load a json file
-json_file = "awesome-japanese-nlp-resources-search.json"
-json_data = read_json(json_file)
-df = pd.DataFrame(json_data)
-
-# Sorted by selected columns
-df = df[
-    [
-        "project_name",
-        "description",
-        "url",
-        "stargazers_count",
-        "downloads",
-        "source",
-        "score",
-        "first_commit",
-        "latest_commit",
-        "languages",
-        "model_or_dataset",
-    ]
-]
-df = df.sort_values(by="score", ascending=False)
-
-
-# Convert DataFrame for Dashboard
-df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
-df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
-df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days
-df = df[df["first_commit"] >= "2009-01-01"]
-df = df[df["latest_commit"] >= "2009-01-01"]
-df["str_languages"] = df["languages"].apply(
-    lambda x: ",".join(x) if isinstance(x, list) else str(x)
-)
-df["year"] = df["first_commit"].dt.year
-
-
-# Set streamlit page settings
-title = "Awesome Japanese NLP resources Dashboard"
-icon = "πŸ”Ž"
-
-st.set_page_config(
-    page_title=title,
-    page_icon=icon,
-    layout="wide",
-    initial_sidebar_state="expanded",
-)
-
-# Main streamlit page (sidebar)
-alt.themes.enable("dark")
-with st.sidebar:
-    st.title(f"{title} {icon}")
-    st.markdown(
-        "You can search for open-source software from [1250+ Japanese NLP repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
-    )
-    query = st.text_input(label="Search keyword")

-    source_type = ["GitHub", "Hugging Face"]
-    selected_source_type = st.selectbox(
-        "Choose a source type: GitHub or Hugging Face", source_type
     )

-    # Filtering GitHub or Hugging Face
-    df = df[df["source"] == selected_source_type]
-
-    if selected_source_type == "GitHub":
-        selected_model_or_dataset = None
-        all_languages = (
-            df["languages"]
-            .dropna()
-            .apply(lambda x: x if isinstance(x, list) else [])
-            .explode()
-            .unique()
-        )
-        all_languages = [""] + all_languages.tolist()
-        selected_languges = st.selectbox(
-            "Choose a programming language", all_languages, index=0
-        )
-
-        min_stars = int(df["stargazers_count"].min())
-        max_stars = int(df["stargazers_count"].max())
-
-        stars_range = st.slider(
-            "Choose the range for the stargazer count",
-            min_value=min_stars,
-            max_value=max_stars,
-            value=(min_stars, max_stars),
-        )
-    else:
-        selected_languges = None
-        selected_model_or_dataset = st.selectbox(
-            "Choose a model or a dataset", ["", "model", "dataset"], index=0
-        )

-        min_downloads = int(df["downloads"].min())
-        max_downloads = int(df["downloads"].max())

-        downloads_range = st.slider(
-            "Choose the range for the number of downloads",
-            min_value=min_downloads,
-            max_value=max_downloads,
-            value=(min_downloads, max_downloads),
-        )

-    min_activity_period = int(df["activity_period"].min())
-    max_activity_period = int(df["activity_period"].max())

-    activity_period_range = st.slider(
-        "Select the range for activity periods (in days)",
-        min_value=min_activity_period,
-        max_value=max_activity_period,
-        value=(min_activity_period, max_activity_period),
-    )
-    years = sorted(list(set(df["year"].dropna().astype(int).tolist())))

-    selected_year_range = st.slider(
-        "Select a range for the years of the first commit",
-        min_value=min(years),
-        max_value=max(years),
-        value=(min(years), max(years)),
     )


-    df = df[
-        (df["year"] >= selected_year_range[0])
-        & (df["year"] <= selected_year_range[1])
-    ]


-    if selected_source_type == "GitHub":
-        df = df[
-            (df["stargazers_count"] >= stars_range[0])
-            & (df["stargazers_count"] <= stars_range[1])
-        ]
-    else:
         df = df[
-            (df["downloads"] >= downloads_range[0])
-            & (df["downloads"] <= downloads_range[1])
         ]

-    df = df[
-        (df["activity_period"] >= activity_period_range[0])
-        & (df["activity_period"] <= activity_period_range[1])
-    ]
-
-    contained_description = df["description"].str.contains(
-        query, case=False, na=False
-    )
-    contained_project_name = df["project_name"].str.contains(
-        query, case=False, na=False
-    )
-    df = df[contained_description | contained_project_name]
-
-    if selected_languges:
-        df = df[
-            df["str_languages"].str.contains(
-                selected_languges, case=False, na=False
-            )
-        ]

-    if selected_model_or_dataset:
         df = df[
-            df["model_or_dataset"].str.contains(
-                selected_model_or_dataset, case=False, na=False
-            )
         ]

-
-# Main streamlit page (columns)
-col1, col2 = st.columns(2, gap="large")
-
-with col1:
-    st.markdown("### DataFrame")
-    st.markdown(f"#### Number of repositories: {len(df)}")
-    st.dataframe(df, height=600)
-
-    projects_per_year = (
-        df.groupby("year").size().reset_index(name="project_count")
     )
-
-    chart = (
-        alt.Chart(projects_per_year)
-        .mark_bar()
-        .encode(
-            x=alt.X("year:O", title="Year"),
-            y=alt.Y("project_count:Q", title="Number of repositories"),
-            tooltip=["year", "project_count"],
-        )
-        .properties(
-            title="Number of projects per year based on the year of the first commit",
-            width=600,
-            height=400,
-        )
     )
-
-    st.altair_chart(chart, use_container_width=True)
-
-with col2:
-    if selected_source_type == "GitHub":
-        vs_type = "stargazers_count"
-    else:
-        vs_type = "downloads"
-
-    st.markdown(f"### First commit vs {vs_type}")
-    chart = (
-        alt.Chart(df)
-        .mark_circle(size=60)
-        .encode(
-            x="first_commit:T",
-            y=f"{vs_type}:Q",
-            tooltip=["first_commit", "project_name", f"{vs_type}"],
         )
-        .properties(
-            title=f"Relationship between first commit date and {vs_type}",
         )
-        .interactive()
-    )
-    st.altair_chart(chart, use_container_width=True)
-
-    st.markdown(f"### Latest commit vs {vs_type}")
-    chart = (
-        alt.Chart(df)
-        .mark_circle(size=60)
-        .encode(
-            x="latest_commit:T",
-            y=f"{vs_type}:Q",
-            tooltip=["project_name", "latest_commit", f"{vs_type}"],
-        )
-        .properties(
-            title=f"Relationship between latest commit date and {vs_type}",
         )
-        .interactive()
-    )
-    st.altair_chart(chart, use_container_width=True)
-
-    st.markdown(f"### Activity period vs {vs_type}")
-    chart = (
-        alt.Chart(df)
-        .mark_circle(size=60)
-        .encode(
-            x=alt.X("activity_period:Q", title="Activity Period (Days)"),
-            y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
-            tooltip=[
-                "project_name",
-                "activity_period",
-                f"{vs_type}",
-            ],
         )
-        .properties(
-            title=f"Relationship between activity period and {vs_type}",
         )
-        .interactive()
-    )
-    st.altair_chart(chart, use_container_width=True)
+@st.cache_data
+def convert_to_dataframe():
+    # Load a json file
+    json_file = "awesome-japanese-nlp-resources-search.json"
+    json_data = read_json(json_file)
+    df = pd.DataFrame(json_data)

+    # Sorted by selected columns
+    df = df[
+        [
+            "project_name",
+            "description",
+            "url",
+            "stargazers_count",
+            "downloads",
+            "source",
+            "score",
+            "first_commit",
+            "latest_commit",
+            "languages",
+            "model_or_dataset",
+        ]
+    ]
+    df = df.sort_values(by="score", ascending=False)
+
+    # Convert DataFrame for Dashboard
+    df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
+    df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
+    df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days
+    df = df[df["first_commit"] >= "2009-01-01"]
+    df = df[df["latest_commit"] >= "2009-01-01"]
+    df["str_languages"] = df["languages"].apply(
+        lambda x: ",".join(x) if isinstance(x, list) else str(x)
     )
+    df["year"] = df["first_commit"].dt.year

+    dataset = load_dataset("taishi-i/nagisa_stopwords")
+    stopwords = dataset["nagisa_stopwords"]["words"]

+    def tokenize_description(description):
+        tokens = nagisa.filter(description, filter_postags=["助詞", "εŠ©ε‹•θ©ž"])
+        words = tokens.words
+        words = [word for word in words if len(word.strip()) > 0]
+        words = [word for word in words if word not in stopwords]
+        words = " ".join(words)
+        return words

+    df["tokenized_description"] = df["description"].apply(tokenize_description)
+    return df


+def main():
+    # Set streamlit page settings
+    title = "Awesome Japanese NLP resources Dashboard"
+    icon = "πŸ”Ž"

+    st.set_page_config(
+        page_title=title,
+        page_icon=icon,
+        layout="wide",
+        initial_sidebar_state="expanded",
     )
+    df = convert_to_dataframe()
+
+    # Main streamlit page (sidebar)
+    alt.themes.enable("dark")
+    with st.sidebar:
+        st.title(f"{title} {icon}")
+        st.markdown(
+            "You can search for open-source software from [1250+ Japanese NLP repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
+        )
+        query = st.text_input(label="Search keyword")

+        source_type = ["GitHub", "Hugging Face"]
+        selected_source_type = st.selectbox(
+            "Choose a source type: GitHub or Hugging Face", source_type
+        )

+        # Filtering GitHub or Hugging Face
+        df = df[df["source"] == selected_source_type]
+
+        if selected_source_type == "GitHub":
+            selected_model_or_dataset = None
+            all_languages = (
+                df["languages"]
+                .dropna()
+                .apply(lambda x: x if isinstance(x, list) else [])
+                .explode()
+                .unique()
+            )
+            all_languages = [""] + all_languages.tolist()
+            selected_languges = st.selectbox(
+                "Choose a programming language", all_languages, index=0
+            )
+
+            min_stars = int(df["stargazers_count"].min())
+            max_stars = int(df["stargazers_count"].max())
+
+            stars_range = st.slider(
+                "Choose the range for the stargazer count",
+                min_value=min_stars,
+                max_value=max_stars,
+                value=(min_stars, max_stars),
+            )
+        else:
+            selected_languges = None
+            selected_model_or_dataset = st.selectbox(
+                "Choose a model or a dataset",
+                ["", "model", "dataset"],
+                index=0,
+            )
+
+            min_downloads = int(df["downloads"].min())
+            max_downloads = int(df["downloads"].max())
+
+            downloads_range = st.slider(
+                "Choose the range for the number of downloads",
+                min_value=min_downloads,
+                max_value=max_downloads,
+                value=(min_downloads, max_downloads),
+            )
+
+        min_activity_period = int(df["activity_period"].min())
+        max_activity_period = int(df["activity_period"].max())
+
+        activity_period_range = st.slider(
+            "Select the range for activity periods (in days)",
+            min_value=min_activity_period,
+            max_value=max_activity_period,
+            value=(min_activity_period, max_activity_period),
+        )
+        years = sorted(list(set(df["year"].dropna().astype(int).tolist())))

+        selected_year_range = st.slider(
+            "Select a range for the years of the first commit",
+            min_value=min(years),
+            max_value=max(years),
+            value=(min(years), max(years)),
+        )

         df = df[
+            (df["year"] >= selected_year_range[0])
+            & (df["year"] <= selected_year_range[1])
         ]

+        if selected_source_type == "GitHub":
+            df = df[
+                (df["stargazers_count"] >= stars_range[0])
+                & (df["stargazers_count"] <= stars_range[1])
+            ]
+        else:
+            df = df[
+                (df["downloads"] >= downloads_range[0])
+                & (df["downloads"] <= downloads_range[1])
+            ]

         df = df[
+            (df["activity_period"] >= activity_period_range[0])
+            & (df["activity_period"] <= activity_period_range[1])
         ]

+        contained_description = df["description"].str.contains(
+            query, case=False, na=False
         )
+        contained_project_name = df["project_name"].str.contains(
+            query, case=False, na=False
         )
+        df = df[contained_description | contained_project_name]
+
+        if selected_languges:
+            df = df[
+                df["str_languages"].str.contains(
+                    selected_languges, case=False, na=False
+                )
+            ]
+
+        if selected_model_or_dataset:
+            df = df[
+                df["model_or_dataset"].str.contains(
+                    selected_model_or_dataset, case=False, na=False
+                )
+            ]
+
+    # Main streamlit page (columns)
+    col1, col2 = st.columns(2, gap="large")
+
+    with col1:
+        st.markdown("### DataFrame")
+        st.markdown(f"#### Number of repositories: {len(df)}")
+        if selected_source_type == "GitHub":
+            stats_key = "stargazers_count"
+        else:
+            stats_key = "downloads"
+
+        if len(df) > 0:
+            mean_value = int(df[stats_key].mean())
+            min_value = int(df[stats_key].min())
+            max_value = int(df[stats_key].max())
+            st.markdown(
+                f"#### {stats_key} mean: {int(mean_value)}, min: {min_value}, max: {max_value}"
+            )
+
+        st.dataframe(df, height=600)
+
+        if len(df) > 0:
+            st.markdown("### Word Cloud")
+            descriptions = df["tokenized_description"].tolist()
+            combined_text = " ".join(descriptions)
+
+            wordcloud = WordCloud(
+                width=800,
+                height=400,
+                font_path=japanize_matplotlib.get_font_ttf_path(),
+                max_words=50,
+                colormap="PuBu",
+            ).generate(combined_text)
+
+            fig, ax = plt.subplots()
+            ax.imshow(wordcloud, interpolation="bilinear")
+            ax.axis("off")
+            st.pyplot(fig, use_container_width=True)
+
+    with col2:
+        if selected_source_type == "GitHub":
+            vs_type = "stargazers_count"
+        else:
+            vs_type = "downloads"
+
+        st.markdown(f"### First commit vs {vs_type}")
+        chart = (
+            alt.Chart(df)
+            .mark_circle(size=60)
+            .encode(
+                x="first_commit:T",
+                y=f"{vs_type}:Q",
+                tooltip=["first_commit", "project_name", f"{vs_type}"],
+            )
+            .properties(
+                title=f"Relationship between first commit date and {vs_type}",
+            )
+            .interactive()
         )
+        st.altair_chart(chart, use_container_width=True)
+
+        st.markdown(f"### Latest commit vs {vs_type}")
+        chart = (
+            alt.Chart(df)
+            .mark_circle(size=60)
+            .encode(
+                x="latest_commit:T",
+                y=f"{vs_type}:Q",
+                tooltip=["project_name", "latest_commit", f"{vs_type}"],
+            )
+            .properties(
+                title=f"Relationship between latest commit date and {vs_type}",
+            )
+            .interactive()
         )
+        st.altair_chart(chart, use_container_width=True)
+
+        st.markdown(f"### Activity period vs {vs_type}")
+        chart = (
+            alt.Chart(df)
+            .mark_circle(size=60)
+            .encode(
+                x=alt.X("activity_period:Q", title="Activity Period (Days)"),
+                y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
+                tooltip=[
+                    "project_name",
+                    "activity_period",
+                    f"{vs_type}",
+                ],
+            )
+            .properties(
+                title=f"Relationship between activity period and {vs_type}",
+            )
+            .interactive()
         )
+        st.altair_chart(chart, use_container_width=True)
+
+        projects_per_year = (
+            df.groupby("year").size().reset_index(name="project_count")
         )
+
+        chart = (
+            alt.Chart(projects_per_year)
+            .mark_bar()
+            .encode(
+                x=alt.X("year:O", title="Year"),
+                y=alt.Y("project_count:Q", title="Number of repositories"),
+                tooltip=["year", "project_count"],
+            )
+            .properties(
+                title="Number of projects per year based on the year of the first commit",
+                width=600,
+                height=400,
+            )
         )
+
+        st.altair_chart(chart, use_container_width=True)
+
+
+if __name__ == "__main__":
+    main()
requirements.txt CHANGED
@@ -2,3 +2,8 @@ streamlit
 pandas
 altair
 plotly
+matplotlib
+nagisa
+datasets
+wordcloud
+japanize_matplotlib
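
For reference, a minimal standalone sketch of the word-cloud pipeline this commit adds to app.py: tokenize Japanese text with nagisa (dropping particles and auxiliary verbs), remove the taishi-i/nagisa_stopwords stopwords, and render the result with wordcloud using the japanize_matplotlib font. The sample sentence and output file name below are illustrative assumptions, not part of the commit.

import japanize_matplotlib
import nagisa
from datasets import load_dataset
from wordcloud import WordCloud

# Stopword list on the Hugging Face Hub (the same dataset app.py loads).
stopwords = load_dataset("taishi-i/nagisa_stopwords")["nagisa_stopwords"]["words"]


def tokenize(text):
    # Drop particles (助詞) and auxiliary verbs (助動詞), then remove empty
    # tokens and stopwords, mirroring tokenize_description() in app.py.
    tokens = nagisa.filter(text, filter_postags=["助詞", "εŠ©ε‹•θ©ž"])
    return " ".join(w for w in tokens.words if w.strip() and w not in stopwords)


# Illustrative input; app.py joins the tokenized project descriptions instead.
text = tokenize("日本語の自然言語処理のためのオープンソースツールを検索できます")
WordCloud(
    width=800,
    height=400,
    font_path=japanize_matplotlib.get_font_ttf_path(),  # bundled Japanese font
    max_words=50,
).generate(text).to_file("wordcloud.png")  # hypothetical output path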