# app.py -- commit 62ea5f3 ("update app.py") by taishi-i, 13.1 kB.
# Streamlit dashboard for searching the "Awesome Japanese NLP resources" list.
import json
from collections import Counter
import altair as alt
import japanize_matplotlib
import matplotlib.pyplot as plt
import nagisa
import pandas as pd
import streamlit as st
from datasets import load_dataset
from wordcloud import WordCloud
def read_json(file_name):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file.
    """
    # Decode explicitly as UTF-8 so non-ASCII (e.g. Japanese) text is read
    # correctly regardless of the platform's default locale encoding.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
@st.cache_data
def convert_to_dataframe():
    """Build the DataFrame that backs the dashboard.

    Reads ``awesome-japanese-nlp-resources-search.json``, keeps the columns
    the dashboard uses, sorts the rows by ``score`` (descending), parses the
    commit dates and adds these derived columns:

    * ``activity_period``: days between the first and the latest commit.
    * ``str_languages``: the ``languages`` list joined with commas.
    * ``year``: year of the first commit.
    * ``tokenized_description``: space-separated tokens of the lower-cased
      description, without particles, auxiliary verbs and stop words.

    Rows whose first or latest commit date is missing, unparsable or earlier
    than 2009-01-01 are dropped.

    Returns:
        pandas.DataFrame: The prepared data (memoized by ``st.cache_data``).
    """
    # Load a json file
    json_file = "awesome-japanese-nlp-resources-search.json"
    json_data = read_json(json_file)
    df = pd.DataFrame(json_data)

    # Keep only the columns used by the dashboard, best score first.
    columns = [
        "project_name",
        "description",
        "url",
        "stargazers_count",
        "downloads",
        "source",
        "score",
        "first_commit",
        "latest_commit",
        "languages",
        "model_or_dataset",
        "model_size",
    ]
    df = df[columns].sort_values(by="score", ascending=False)

    # Convert DataFrame for Dashboard.
    # Unparsable dates become NaT and are removed by the cutoff filter below.
    df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
    df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
    df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days

    cutoff = "2009-01-01"
    is_recent = (df["first_commit"] >= cutoff) & (
        df["latest_commit"] >= cutoff
    )
    # Take an explicit copy: the columns added below must not be written
    # into a slice of the unfiltered frame (pandas chained-assignment warning).
    df = df[is_recent].copy()

    df["str_languages"] = df["languages"].apply(
        lambda x: ",".join(x) if isinstance(x, list) else str(x)
    )
    df["year"] = df["first_commit"].dt.year

    dataset = load_dataset("taishi-i/nagisa_stopwords")
    # A set gives O(1) membership tests; the lookup runs once per token.
    stopwords = set(dataset["nagisa_stopwords"]["words"])

    def tokenize_description(description):
        """Return the content words of *description* joined by spaces."""
        # Missing descriptions (None/NaN) have nothing to tokenize.
        if not isinstance(description, str):
            return ""

        # Drop particles (助詞) and auxiliary verbs (助動詞).
        tokens = nagisa.filter(
            description.lower(), filter_postags=["助詞", "助動詞"]
        )
        return " ".join(
            word
            for word in tokens.words
            if word.strip() and word not in stopwords
        )

    df["tokenized_description"] = df["description"].apply(tokenize_description)
    return df
def _stats_column(source_type):
    """Return the popularity column that applies to *source_type*.

    GitHub entries use ``stargazers_count``; any other source uses
    ``downloads``.
    """
    return "stargazers_count" if source_type == "GitHub" else "downloads"


def _int_range_slider(label, series):
    """Show a range slider spanning the integer min/max of *series*.

    Returns:
        tuple: ``(lower, upper, selected)`` -- the slider bounds and the
        ``(low, high)`` pair currently selected by the user.
    """
    lower = int(series.min())
    upper = int(series.max())
    selected = st.slider(
        label, min_value=lower, max_value=upper, value=(lower, upper)
    )
    return lower, upper, selected


def _scatter_chart(data, x, y, tooltip, title):
    """Build an interactive Altair scatter plot of *data*."""
    return (
        alt.Chart(data)
        .mark_circle(size=60)
        .encode(x=x, y=y, tooltip=tooltip)
        .properties(title=title)
        .interactive()
    )


def main():
    """Render the dashboard page.

    The sidebar collects a search keyword, the source type (Hugging Face or
    GitHub) and source-specific range filters. The filtered data is then
    shown as a table, a word cloud and a set of charts in two columns.
    """
    # Set streamlit page settings
    title = "Awesome Japanese NLP resources Dashboard"
    icon = "🔎"
    st.set_page_config(
        page_title=title,
        page_icon=icon,
        layout="wide",
        initial_sidebar_state="expanded",
    )

    df = convert_to_dataframe()

    # Main streamlit page (sidebar)
    alt.themes.enable("dark")
    with st.sidebar:
        st.title(f"{title} {icon}")
        st.markdown(
            "You can search for open-source software from [1250+ Japanese NLP"
            " repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
        )

        query = st.text_input(label="Search keyword")

        source_type = ["Hugging Face", "GitHub"]
        selected_source_type = st.selectbox(
            "Choose a source type: Hugging Face or GitHub", source_type
        )
        is_github = selected_source_type == "GitHub"
        stats_key = _stats_column(selected_source_type)

        # Filtering GitHub or Hugging Face
        df = df[df["source"] == selected_source_type]

        if is_github:
            selected_model_or_dataset = None

            # Distinct languages in first-seen order. Rows without a list
            # are skipped so no NaN placeholder ends up among the options.
            language_options = list(
                dict.fromkeys(
                    language
                    for languages in df["languages"]
                    if isinstance(languages, list)
                    for language in languages
                )
            )
            selected_language = st.selectbox(
                "Choose a programming language",
                [""] + language_options,
                index=0,
            )

            _, _, stars_range = _int_range_slider(
                "Choose the range for the stargazer count",
                df["stargazers_count"],
            )
        else:
            selected_language = None

            selected_model_or_dataset = st.selectbox(
                "Choose a model or a dataset",
                ["", "model", "dataset"],
                index=0,
            )

            min_downloads, max_downloads, downloads_range = _int_range_slider(
                "Choose the range for the number of downloads",
                df["downloads"],
            )
            min_model_size, max_model_size, model_size_range = (
                _int_range_slider(
                    "Choose the range for the model size (billion)",
                    df["model_size"],
                )
            )

        _, _, activity_period_range = _int_range_slider(
            "Select the range for activity periods (in days)",
            df["activity_period"],
        )
        _, _, selected_year_range = _int_range_slider(
            "Select a range for the years of the first commit",
            df["year"],
        )

        st.markdown("Sorted by")
        # At least one row: a negative count would slice rows off the end.
        num_show_repos = st.number_input(
            "Number of sorted repositories", value=15, min_value=1
        )
        latest_repos_btn = st.button("Latest repositories")
        trend_repos_btn = st.button("Trend repositories")

    # Apply the sidebar filters.
    df = df[df["year"].between(*selected_year_range)]

    if is_github:
        df = df[df["stargazers_count"].between(*stars_range)]
    else:
        # Filter only when the user narrowed a slider, so rows with a
        # missing value stay visible while the full range is selected.
        if (
            downloads_range[0] > min_downloads
            or downloads_range[1] < max_downloads
        ):
            df = df[df["downloads"].between(*downloads_range)]

        if (
            model_size_range[0] > min_model_size
            or model_size_range[1] < max_model_size
        ):
            df = df[df["model_size"].between(*model_size_range)]

    df = df[df["activity_period"].between(*activity_period_range)]

    # Match the keyword literally: as a regular expression, input such as
    # "c++" or "(" would raise instead of searching.
    contained_description = df["description"].str.contains(
        query, case=False, na=False, regex=False
    )
    contained_project_name = df["project_name"].str.contains(
        query, case=False, na=False, regex=False
    )
    df = df[contained_description | contained_project_name]

    if selected_language:
        # Exact membership in the language list: a substring match would
        # make "C" select "JavaScript" and fail on "C++".
        has_language = (
            df["languages"]
            .apply(
                lambda languages: isinstance(languages, list)
                and selected_language in languages
            )
            .astype(bool)
        )
        df = df[has_language]

    if selected_model_or_dataset:
        df = df[
            df["model_or_dataset"].str.contains(
                selected_model_or_dataset, case=False, na=False
            )
        ]

    if latest_repos_btn:
        df = df.sort_values(by="first_commit", ascending=False)
        df = df.head(num_show_repos)

    if trend_repos_btn:
        df = df.sort_values(by=stats_key, ascending=False)
        df = df.head(num_show_repos)

    # Main streamlit page (columns)
    col1, col2 = st.columns(2, gap="large")

    with col1:
        st.markdown("### DataFrame")
        st.markdown(f"#### Number of repositories: {len(df)}")

        # Skip the summary when there is no value to aggregate; int(NaN)
        # would raise.
        stats = df[stats_key].dropna()
        if not stats.empty:
            mean_value = int(stats.mean())
            min_value = int(stats.min())
            max_value = int(stats.max())

            st.markdown(
                f"#### {stats_key} mean: {mean_value}, min: {min_value},"
                f" max: {max_value}"
            )

        st.dataframe(df, height=600)

        combined_text = " ".join(df["tokenized_description"].tolist())
        # WordCloud raises when it is given no words at all.
        if combined_text.strip():
            st.markdown("### Word Cloud")
            wordcloud = WordCloud(
                width=800,
                height=400,
                font_path=japanize_matplotlib.get_font_ttf_path(),
                max_words=50,
                colormap="PuBu",
            ).generate(combined_text)

            fig, ax = plt.subplots()
            ax.imshow(wordcloud, interpolation="bilinear")
            ax.axis("off")
            st.pyplot(fig, use_container_width=True)
            # The script reruns on every interaction; close the figure so
            # figures do not pile up in pyplot's registry.
            plt.close(fig)

        if is_github:
            language_counts = Counter(
                language
                for languages in df["languages"]
                if isinstance(languages, list)
                for language in languages
            )
            language_df = pd.DataFrame(
                language_counts.items(), columns=["Language", "Count"]
            )
            language_df = language_df.sort_values(by="Count", ascending=False)

            st.markdown("### Language Usage Table")
            st.dataframe(language_df)
        else:
            st.markdown("### Model size vs downloads")
            chart = _scatter_chart(
                df,
                x="model_size",
                y="downloads",
                tooltip=["project_name", "model_size", "downloads"],
                title=(
                    "Relationship between model size (Billion) and"
                    " downloads"
                ),
            )
            st.altair_chart(chart, use_container_width=True)

    with col2:
        st.markdown(f"### First commit vs {stats_key}")
        chart = _scatter_chart(
            df,
            x="first_commit:T",
            y=f"{stats_key}:Q",
            tooltip=["first_commit", "project_name", stats_key],
            title=f"Relationship between first commit date and {stats_key}",
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Latest commit vs {stats_key}")
        chart = _scatter_chart(
            df,
            x="latest_commit:T",
            y=f"{stats_key}:Q",
            tooltip=["project_name", "latest_commit", stats_key],
            title=f"Relationship between latest commit date and {stats_key}",
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Activity period vs {stats_key}")
        chart = _scatter_chart(
            df,
            x=alt.X("activity_period:Q", title="Activity Period (Days)"),
            y=alt.Y(f"{stats_key}:Q", title=stats_key),
            tooltip=["project_name", "activity_period", stats_key],
            title=f"Relationship between activity period and {stats_key}",
        )
        st.altair_chart(chart, use_container_width=True)

        projects_per_year = (
            df.groupby("year").size().reset_index(name="project_count")
        )
        chart = (
            alt.Chart(projects_per_year)
            .mark_bar()
            .encode(
                x=alt.X("year:O", title="Year"),
                y=alt.Y("project_count:Q", title="Number of repositories"),
                tooltip=["year", "project_count"],
            )
            .properties(
                title=(
                    "Number of projects per year based on the year of the"
                    " first commit"
                ),
                width=600,
                height=400,
            )
        )
        st.altair_chart(chart, use_container_width=True)
# Render the dashboard only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()