import json |
from collections import Counter |
import altair as alt |
import japanize_matplotlib |
import matplotlib.pyplot as plt |
import nagisa |
import pandas as pd |
import streamlit as st |
from datasets import load_dataset |
from wordcloud import WordCloud |
def read_json(file_name):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The deserialized JSON value produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would garble non-ASCII text where the default differs.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
@st.cache_data
def convert_to_dataframe():
    """Build the DataFrame that backs the dashboard.

    Reads ``awesome-japanese-nlp-resources-search.json``, keeps the columns
    the dashboard uses, sorts the rows by ``score`` (highest first), parses
    the commit dates and drops every row whose first or latest commit is
    missing, unparsable or earlier than 2009-01-01.

    The following derived columns are added:

    * ``activity_period``: days between the first and the latest commit.
    * ``str_languages``: the ``languages`` list joined with commas (or the
      ``str()`` of the value when it is not a list).
    * ``year``: year of the first commit.
    * ``tokenized_description``: space-separated nagisa tokens of the
      lower-cased description, without particles, auxiliary verbs,
      whitespace-only tokens and stopwords.

    The function is wrapped in ``st.cache_data`` so Streamlit can reuse the
    result instead of rebuilding it on every rerun.

    Returns:
        pandas.DataFrame: One row per repository, as described above.
    """
    columns = [
        "project_name",
        "description",
        "url",
        "stargazers_count",
        "downloads",
        "source",
        "score",
        "first_commit",
        "latest_commit",
        "languages",
        "model_or_dataset",
        "model_size",
    ]
    json_data = read_json("awesome-japanese-nlp-resources-search.json")
    df = pd.DataFrame(json_data)[columns]
    df = df.sort_values(by="score", ascending=False)

    # errors="coerce" turns unparsable dates into NaT; those rows fail the
    # date comparison below and are dropped together with pre-2009 rows.
    df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
    df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
    df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days

    recent_enough = (df["first_commit"] >= "2009-01-01") & (
        df["latest_commit"] >= "2009-01-01"
    )
    # Copy so the column assignments below act on an independent frame rather
    # than on a filtered view of the previous one.
    df = df[recent_enough].copy()

    df["str_languages"] = df["languages"].apply(
        lambda x: ",".join(x) if isinstance(x, list) else str(x)
    )
    df["year"] = df["first_commit"].dt.year

    dataset = load_dataset("taishi-i/nagisa_stopwords")
    # A set makes the per-token membership test below O(1).
    stopwords = set(dataset["nagisa_stopwords"]["words"])

    def tokenize_description(description):
        """Return the content words of ``description`` joined by spaces."""
        # Missing descriptions are not strings; treat them as empty text.
        if not isinstance(description, str):
            return ""

        # nagisa POS tags are Japanese strings: 助詞 (particle) and
        # 助動詞 (auxiliary verb) are removed from the token stream.
        tokens = nagisa.filter(
            description.lower(), filter_postags=["助詞", "助動詞"]
        )
        return " ".join(
            word
            for word in tokens.words
            if word.strip() and word not in stopwords
        )

    df["tokenized_description"] = df["description"].apply(tokenize_description)

    return df
def _scatter_chart(df, x, y, tooltip, title):
    """Build an interactive Altair scatter plot.

    Args:
        df: Data to plot.
        x: Encoding for the x axis (shorthand string or ``alt.X``).
        y: Encoding for the y axis (shorthand string or ``alt.Y``).
        tooltip: Column names shown when hovering over a point.
        title: Chart title.

    Returns:
        The configured Altair chart.
    """
    return (
        alt.Chart(df)
        .mark_circle(size=60)
        .encode(x=x, y=y, tooltip=tooltip)
        .properties(title=title)
        .interactive()
    )


def _count_languages(df):
    """Return a ``Language``/``Count`` table of ``df["languages"]``.

    Rows whose ``languages`` value is not a list are ignored. The table is
    ordered from the most to the least frequent language.
    """
    counts = Counter(
        language
        for languages in df["languages"]
        if isinstance(languages, list)
        for language in languages
    )
    return pd.DataFrame(counts.most_common(), columns=["Language", "Count"])


def _render_word_cloud(df):
    """Render a word cloud of ``df["tokenized_description"]`` in Streamlit."""
    st.markdown("### Word Cloud")

    combined_text = " ".join(df["tokenized_description"].tolist())
    try:
        wordcloud = WordCloud(
            width=800,
            height=400,
            font_path=japanize_matplotlib.get_font_ttf_path(),
            max_words=50,
            colormap="PuBu",
        ).generate(combined_text)
    except ValueError:
        # WordCloud refuses to draw when the text yields no usable words.
        st.info("Not enough words in the descriptions to draw a word cloud.")
        return

    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    st.pyplot(fig, use_container_width=True)
    # pyplot keeps every figure it creates alive until it is closed; close it
    # so figures do not pile up as the script reruns.
    plt.close(fig)


def main():
    """Render the Streamlit dashboard.

    The sidebar collects a search keyword, the source type (Hugging Face or
    GitHub) and source-specific filters, plus ranges for the activity period
    and the year of the first commit, and two buttons that sort and truncate
    the result. The filtered DataFrame is then shown in two columns: the left
    one holds the table, summary statistics, a word cloud and either a
    language usage table (GitHub) or a model size vs downloads chart
    (Hugging Face); the right one holds scatter plots against the commit
    dates and activity period and a bar chart of repositories per year.
    """
    title = "Awesome Japanese NLP resources Dashboard"
    # NOTE(review): this literal may be a mis-decoded emoji; confirm the
    # intended page icon.
    icon = "π"

    st.set_page_config(
        page_title=title,
        page_icon=icon,
        layout="wide",
        initial_sidebar_state="expanded",
    )

    df = convert_to_dataframe()

    alt.themes.enable("dark")

    with st.sidebar:
        st.title(f"{title} {icon}")
        st.markdown(
            "You can search for open-source software from [1250+ Japanese NLP"
            " repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
        )

        query = st.text_input(label="Search keyword")

        source_type = ["Hugging Face", "GitHub"]
        selected_source_type = st.selectbox(
            "Choose a source type: Hugging Face or GitHub", source_type
        )
        is_github = selected_source_type == "GitHub"
        # Popularity column used for sorting, statistics and chart axes.
        stats_key = "stargazers_count" if is_github else "downloads"

        df = df[df["source"] == selected_source_type]

        if is_github:
            selected_model_or_dataset = None

            # Rows without a language list explode to NaN; drop those so the
            # select box only offers real language names.
            all_languages = (
                df["languages"]
                .apply(lambda x: x if isinstance(x, list) else [])
                .explode()
                .dropna()
                .unique()
                .tolist()
            )
            selected_language = st.selectbox(
                "Choose a programming language",
                [""] + all_languages,
                index=0,
            )

            min_stars = int(df["stargazers_count"].min())
            max_stars = int(df["stargazers_count"].max())
            stars_range = st.slider(
                "Choose the range for the stargazer count",
                min_value=min_stars,
                max_value=max_stars,
                value=(min_stars, max_stars),
            )
        else:
            selected_language = None

            selected_model_or_dataset = st.selectbox(
                "Choose a model or a dataset",
                ["", "model", "dataset"],
                index=0,
            )

            min_downloads = int(df["downloads"].min())
            max_downloads = int(df["downloads"].max())
            downloads_range = st.slider(
                "Choose the range for the number of downloads",
                min_value=min_downloads,
                max_value=max_downloads,
                value=(min_downloads, max_downloads),
            )

            min_model_size = int(df["model_size"].min())
            max_model_size = int(df["model_size"].max())
            model_size_range = st.slider(
                "Choose the range for the model size (billion)",
                min_value=min_model_size,
                max_value=max_model_size,
                value=(min_model_size, max_model_size),
            )

        min_activity_period = int(df["activity_period"].min())
        max_activity_period = int(df["activity_period"].max())
        activity_period_range = st.slider(
            "Select the range for activity periods (in days)",
            min_value=min_activity_period,
            max_value=max_activity_period,
            value=(min_activity_period, max_activity_period),
        )

        years = df["year"].dropna().astype(int)
        min_year = int(years.min())
        max_year = int(years.max())
        selected_year_range = st.slider(
            "Select a range for the years of the first commit",
            min_value=min_year,
            max_value=max_year,
            value=(min_year, max_year),
        )

        st.markdown("Sorted by")
        # min_value keeps the value usable as a row count below.
        num_show_repos = st.number_input(
            "Number of sorted repositories", min_value=1, value=15
        )
        latest_repos_btn = st.button("Latest repositories")
        trend_repos_btn = st.button("Trend repositories")

    df = df[df["year"].between(selected_year_range[0], selected_year_range[1])]

    if is_github:
        df = df[df["stargazers_count"].between(stars_range[0], stars_range[1])]
    else:
        # Only filter when the user narrowed a slider, so rows without a
        # value stay visible while the slider is at its full range.
        if (
            downloads_range[0] > min_downloads
            or downloads_range[1] < max_downloads
        ):
            df = df[
                df["downloads"].between(downloads_range[0], downloads_range[1])
            ]

        if (
            model_size_range[0] > min_model_size
            or model_size_range[1] < max_model_size
        ):
            df = df[
                df["model_size"].between(
                    model_size_range[0], model_size_range[1]
                )
            ]

    df = df[
        df["activity_period"].between(
            activity_period_range[0], activity_period_range[1]
        )
    ]

    # regex=False: the keyword is plain text typed by the user, so characters
    # such as "+" or "(" must be matched literally.
    contained_description = df["description"].str.contains(
        query, case=False, na=False, regex=False
    )
    contained_project_name = df["project_name"].str.contains(
        query, case=False, na=False, regex=False
    )
    df = df[contained_description | contained_project_name]

    if selected_language:
        # Exact membership in the language list, so that e.g. "C" does not
        # also select "C++" or "JavaScript". astype(bool) keeps the mask
        # boolean even when no rows are left.
        has_language = (
            df["languages"]
            .apply(
                lambda langs: isinstance(langs, list)
                and selected_language in langs
            )
            .astype(bool)
        )
        df = df[has_language]

    if selected_model_or_dataset:
        df = df[
            df["model_or_dataset"].str.contains(
                selected_model_or_dataset, case=False, na=False, regex=False
            )
        ]

    if latest_repos_btn:
        df = df.sort_values(by="first_commit", ascending=False)
        df = df.head(int(num_show_repos))

    if trend_repos_btn:
        df = df.sort_values(by=stats_key, ascending=False)
        df = df.head(int(num_show_repos))

    col1, col2 = st.columns(2, gap="large")

    with col1:
        st.markdown("### DataFrame")
        st.markdown(f"#### Number of repositories: {len(df)}")

        # Skip the summary when there is no value to aggregate.
        stats = df[stats_key].dropna()
        if not stats.empty:
            mean_value = int(stats.mean())
            min_value = int(stats.min())
            max_value = int(stats.max())
            st.markdown(
                f"#### {stats_key} mean: {mean_value}, min: {min_value},"
                f" max: {max_value}"
            )

        st.dataframe(df, height=600)

        if not df.empty:
            _render_word_cloud(df)

        if is_github:
            st.markdown("### Language Usage Table")
            st.dataframe(_count_languages(df))
        else:
            st.markdown("### Model size vs downloads")
            chart = _scatter_chart(
                df,
                x="model_size",
                y="downloads",
                tooltip=["project_name", "model_size", "downloads"],
                title=(
                    "Relationship between model size (Billion) and"
                    " downloads"
                ),
            )
            st.altair_chart(chart, use_container_width=True)

    with col2:
        st.markdown(f"### First commit vs {stats_key}")
        chart = _scatter_chart(
            df,
            x="first_commit:T",
            y=f"{stats_key}:Q",
            tooltip=["first_commit", "project_name", stats_key],
            title=f"Relationship between first commit date and {stats_key}",
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Latest commit vs {stats_key}")
        chart = _scatter_chart(
            df,
            x="latest_commit:T",
            y=f"{stats_key}:Q",
            tooltip=["project_name", "latest_commit", stats_key],
            title=f"Relationship between latest commit date and {stats_key}",
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Activity period vs {stats_key}")
        chart = _scatter_chart(
            df,
            x=alt.X("activity_period:Q", title="Activity Period (Days)"),
            y=alt.Y(f"{stats_key}:Q", title=stats_key),
            tooltip=["project_name", "activity_period", stats_key],
            title=f"Relationship between activity period and {stats_key}",
        )
        st.altair_chart(chart, use_container_width=True)

        projects_per_year = (
            df.groupby("year").size().reset_index(name="project_count")
        )
        chart = (
            alt.Chart(projects_per_year)
            .mark_bar()
            .encode(
                x=alt.X("year:O", title="Year"),
                y=alt.Y("project_count:Q", title="Number of repositories"),
                tooltip=["year", "project_count"],
            )
            .properties(
                title=(
                    "Number of projects per year based on the year of the"
                    " first commit"
                ),
                width=600,
                height=400,
            )
        )
        st.altair_chart(chart, use_container_width=True)
# Build the dashboard only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()