taishi-i committed
Commit 49c8013
1 Parent(s): 3c9e152

add application files
app.py ADDED
@@ -0,0 +1,266 @@
+ import json
+
+ import altair as alt
+ import pandas as pd
+ import streamlit as st
+
+
+ def read_json(file_name):
+     with open(file_name, "r") as f:
+         json_data = json.load(f)
+     return json_data
+
+
+ # Load a json file
+ json_file = "awesome-japanese-nlp-resources-search.json"
+ json_data = read_json(json_file)
+ df = pd.DataFrame(json_data)
+
+ # Select columns and sort by score
+ df = df[
+     [
+         "project_name",
+         "description",
+         "url",
+         "stargazers_count",
+         "downloads",
+         "source",
+         "score",
+         "first_commit",
+         "latest_commit",
+         "languages",
+         "model_or_dataset",
+     ]
+ ]
+ df = df.sort_values(by="score", ascending=False)
+
+
+ # Convert DataFrame for Dashboard
+ df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
+ df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
+ df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days
+ df = df[df["first_commit"] >= "2009-01-01"]
+ df = df[df["latest_commit"] >= "2009-01-01"]
+ df["str_languages"] = df["languages"].apply(
+     lambda x: ",".join(x) if isinstance(x, list) else str(x)
+ )
+ df["year"] = df["first_commit"].dt.year
+
+
+ # Set streamlit page settings
+ title = "Awesome Japanese NLP resources Dashboard"
+ icon = "🔎"
+
+ st.set_page_config(
+     page_title=title,
+     page_icon=icon,
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+ # Main streamlit page (sidebar)
+ alt.themes.enable("dark")
+ with st.sidebar:
+     st.title(f"{title} {icon}")
+     st.markdown(
+         "You can search for open-source software from [1250+ Japanese NLP repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
+     )
+     query = st.text_input(label="Search keyword")
+
+     source_type = ["GitHub", "Hugging Face"]
+     selected_source_type = st.selectbox(
+         "Choose a source type: GitHub or Hugging Face", source_type
+     )
+
+     # Filtering GitHub or Hugging Face
+     df = df[df["source"] == selected_source_type]
+
+     if selected_source_type == "GitHub":
+         selected_model_or_dataset = None
+         all_languages = (
+             df["languages"]
+             .dropna()
+             .apply(lambda x: x if isinstance(x, list) else [])
+             .explode()
+             .unique()
+         )
+         all_languages = [""] + all_languages.tolist()
+         selected_languages = st.selectbox(
+             "Choose a programming language", all_languages, index=0
+         )
+
+         min_stars = int(df["stargazers_count"].min())
+         max_stars = int(df["stargazers_count"].max())
+
+         stars_range = st.slider(
+             "Choose the range for the stargazer count",
+             min_value=min_stars,
+             max_value=max_stars,
+             value=(min_stars, max_stars),
+         )
+     else:
+         selected_languages = None
+         selected_model_or_dataset = st.selectbox(
+             "Choose a model or a dataset", ["", "model", "dataset"], index=0
+         )
+
+         min_downloads = int(df["downloads"].min())
+         max_downloads = int(df["downloads"].max())
+
+         downloads_range = st.slider(
+             "Choose the range for the number of downloads",
+             min_value=min_downloads,
+             max_value=max_downloads,
+             value=(min_downloads, max_downloads),
+         )
+
+     min_activity_period = int(df["activity_period"].min())
+     max_activity_period = int(df["activity_period"].max())
+
+     activity_period_range = st.slider(
+         "Select the range for activity periods (in days)",
+         min_value=min_activity_period,
+         max_value=max_activity_period,
+         value=(min_activity_period, max_activity_period),
+     )
+     years = sorted(list(set(df["year"].dropna().astype(int).tolist())))
+
+     selected_year_range = st.slider(
+         "Select a range for the years of the first commit",
+         min_value=min(years),
+         max_value=max(years),
+         value=(min(years), max(years)),
+     )
+
+
+ df = df[
+     (df["year"] >= selected_year_range[0])
+     & (df["year"] <= selected_year_range[1])
+ ]
+
+
+ if selected_source_type == "GitHub":
+     df = df[
+         (df["stargazers_count"] >= stars_range[0])
+         & (df["stargazers_count"] <= stars_range[1])
+     ]
+ else:
+     df = df[
+         (df["downloads"] >= downloads_range[0])
+         & (df["downloads"] <= downloads_range[1])
+     ]
+
+ df = df[
+     (df["activity_period"] >= activity_period_range[0])
+     & (df["activity_period"] <= activity_period_range[1])
+ ]
+
+ contained_description = df["description"].str.contains(
+     query, case=False, na=False
+ )
+ contained_project_name = df["project_name"].str.contains(
+     query, case=False, na=False
+ )
+ df = df[contained_description | contained_project_name]
+
+ if selected_languages:
+     df = df[
+         df["str_languages"].str.contains(
+             selected_languages, case=False, na=False
+         )
+     ]
+
+ if selected_model_or_dataset:
+     df = df[
+         df["model_or_dataset"].str.contains(
+             selected_model_or_dataset, case=False, na=False
+         )
+     ]
+
+
+ # Main streamlit page (columns)
+ col1, col2 = st.columns(2, gap="large")
+
+ with col1:
+     st.markdown("### DataFrame")
+     st.markdown(f"#### Number of repositories: {len(df)}")
+     st.dataframe(df, height=600)
+
+     projects_per_year = (
+         df.groupby("year").size().reset_index(name="project_count")
+     )
+
+     chart = (
+         alt.Chart(projects_per_year)
+         .mark_bar()
+         .encode(
+             x=alt.X("year:O", title="Year"),
+             y=alt.Y("project_count:Q", title="Number of repositories"),
+             tooltip=["year", "project_count"],
+         )
+         .properties(
+             title="Number of projects per year based on the year of the first commit",
+             width=600,
+             height=400,
+         )
+     )
+
+     st.altair_chart(chart, use_container_width=True)
+
+ with col2:
+     if selected_source_type == "GitHub":
+         vs_type = "stargazers_count"
+     else:
+         vs_type = "downloads"
+
+     st.markdown(f"### First commit vs {vs_type}")
+     chart = (
+         alt.Chart(df)
+         .mark_circle(size=60)
+         .encode(
+             x="first_commit:T",
+             y=f"{vs_type}:Q",
+             tooltip=["first_commit", "project_name", f"{vs_type}"],
+         )
+         .properties(
+             title=f"Relationship between first commit date and {vs_type}",
+         )
+         .interactive()
+     )
+     st.altair_chart(chart, use_container_width=True)
+
+     st.markdown(f"### Latest commit vs {vs_type}")
+     chart = (
+         alt.Chart(df)
+         .mark_circle(size=60)
+         .encode(
+             x="latest_commit:T",
+             y=f"{vs_type}:Q",
+             tooltip=["project_name", "latest_commit", f"{vs_type}"],
+         )
+         .properties(
+             title=f"Relationship between latest commit date and {vs_type}",
+         )
+         .interactive()
+     )
+     st.altair_chart(chart, use_container_width=True)
+
+     st.markdown(f"### Activity period vs {vs_type}")
+     chart = (
+         alt.Chart(df)
+         .mark_circle(size=60)
+         .encode(
+             x=alt.X("activity_period:Q", title="Activity Period (Days)"),
+             y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
+             tooltip=[
+                 "project_name",
+                 "activity_period",
+                 f"{vs_type}",
+             ],
+         )
+         .properties(
+             title=f"Relationship between activity period and {vs_type}",
+         )
+         .interactive()
+     )
+     st.altair_chart(chart, use_container_width=True)
awesome-japanese-nlp-resources-search.json ADDED
The diff for this file is too large to render. See raw diff
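
Since the JSON diff itself cannot be rendered, here is a minimal, hypothetical sketch of the record shape app.py appears to expect, inferred from the columns it selects. The field names come from the code; the example values and the example_record name are invented for illustration only.

# Hypothetical example record (values invented for illustration);
# app.py loads a list of such objects and builds a pandas DataFrame from it.
example_record = {
    "project_name": "example-tokenizer",
    "description": "A Japanese morphological analyzer",
    "url": "https://github.com/example/example-tokenizer",
    "stargazers_count": 120,      # drives the GitHub star-count slider
    "downloads": 0,               # drives the Hugging Face downloads slider
    "source": "GitHub",           # "GitHub" or "Hugging Face"
    "score": 0.85,                # used only for the initial sort order
    "first_commit": "2019-04-01 10:00:00",   # parsed with pd.to_datetime
    "latest_commit": "2024-01-15 12:00:00",
    "languages": ["Python"],      # list of programming languages
    "model_or_dataset": None,     # "model" / "dataset" for Hugging Face entries
}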
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit
+ pandas
+ altair
+ plotly
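
Assuming the JSON file above sits next to app.py, the standard Streamlit workflow should be enough to run the Space locally: install the dependencies with `pip install -r requirements.txt`, then start the dashboard with `streamlit run app.py`.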