|
import json |
|
from collections import Counter |
|
|
|
import altair as alt |
|
import japanize_matplotlib |
|
import matplotlib.pyplot as plt |
|
import nagisa |
|
import pandas as pd |
|
import streamlit as st |
|
from datasets import load_dataset |
|
from wordcloud import WordCloud |
|
|
|
|
|
def read_json(file_name):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The deserialized value produced by ``json.load``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would corrupt or reject non-ASCII text on systems whose
    # default is not UTF-8.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
@st.cache_data
def convert_to_dataframe():
    """Build the DataFrame that backs the dashboard.

    Reads ``awesome-japanese-nlp-resources-search.json``, keeps the columns
    used by the dashboard, sorts by ``score`` (descending), parses the commit
    dates and keeps only rows whose first and latest commits are on or after
    2009-01-01 (rows with unparsable dates are dropped by the same
    comparison). The following derived columns are added:

    * ``activity_period``: days between the first and the latest commit.
    * ``str_languages``: comma-joined ``languages`` list (``str(x)`` for
      non-list values).
    * ``year``: year of the first commit.
    * ``tokenized_description``: space-joined tokens of the lower-cased
      description with particles, auxiliary verbs, blank tokens and stop
      words removed.

    Returns:
        The prepared ``pandas.DataFrame``.
    """
    json_file = "awesome-japanese-nlp-resources-search.json"
    df = pd.DataFrame(read_json(json_file))

    df = df[
        [
            "project_name",
            "description",
            "url",
            "stargazers_count",
            "downloads",
            "model_architectures",
            "source",
            "score",
            "first_commit",
            "latest_commit",
            "languages",
            "model_or_dataset",
            "model_size",
        ]
    ]
    df = df.sort_values(by="score", ascending=False)

    # Unparsable dates become NaT; NaT fails the comparisons below, so those
    # rows are removed together with the pre-2009 ones.
    df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
    df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
    df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days
    df = df[df["first_commit"] >= "2009-01-01"]
    df = df[df["latest_commit"] >= "2009-01-01"]

    # Work on an explicit copy so that adding columns to the filtered frame
    # does not trigger pandas' SettingWithCopyWarning.
    df = df.copy()
    df["str_languages"] = df["languages"].apply(
        lambda x: ",".join(x) if isinstance(x, list) else str(x)
    )
    df["year"] = df["first_commit"].dt.year

    dataset = load_dataset("taishi-i/nagisa_stopwords")
    # A set gives constant-time membership tests for every token.
    stopwords = set(dataset["nagisa_stopwords"]["words"])

    def tokenize_description(description):
        """Return the filtered tokens of *description* joined by spaces."""
        # Missing descriptions (None / NaN) have nothing to tokenize.
        if not isinstance(description, str):
            return ""
        # Drop particles (助詞) and auxiliary verbs (助動詞) by POS tag.
        tokens = nagisa.filter(
            description.lower(), filter_postags=["助詞", "助動詞"]
        )
        return " ".join(
            word
            for word in tokens.words
            if word.strip() and word not in stopwords
        )

    df["tokenized_description"] = df["description"].apply(tokenize_description)
    return df
|
|
|
|
|
def _within(series, bounds):
    """Return a boolean mask of the *series* values inside the inclusive *bounds* pair."""
    lower, upper = bounds
    return (series >= lower) & (series <= upper)


def _contains_text(series, pattern, literal=False):
    """Return a case-insensitive "contains" mask for a string *series*.

    *pattern* is treated as a regular expression unless *literal* is true or
    the pattern is not a valid regular expression (for example ``c++`` or
    ``(``), in which case it is matched as plain text instead of raising.
    Missing values never match.
    """
    import re  # local import: only this helper needs it

    if not literal:
        try:
            re.compile(pattern)
        except re.error:
            literal = True
    return series.str.contains(pattern, case=False, na=False, regex=not literal)


def _count_model_architectures(df):
    """Count the rows of *df* per non-missing ``model_architectures`` value."""
    return Counter(df["model_architectures"].dropna().tolist())


def _scatter_chart(df, x, y, tooltip, title):
    """Build an interactive Altair circle chart of *df* with the given encodings."""
    return (
        alt.Chart(df)
        .mark_circle(size=60)
        .encode(x=x, y=y, tooltip=tooltip)
        .properties(title=title)
        .interactive()
    )


def main():
    """Render the dashboard: sidebar filters, result table and charts.

    The sidebar collects a keyword, a source type ("Hugging Face" or
    "GitHub"), source-specific filters, activity-period and first-commit-year
    ranges, and the two sort buttons. The filtered DataFrame is then shown in
    the left column together with a word cloud and a language or
    model-architecture table; the right column shows the scatter charts and
    the projects-per-year bar chart.
    """
    title = "Awesome Japanese NLP resources Dashboard"
    # NOTE(review): this icon looks like a mis-decoded emoji - confirm the
    # intended character against the original file.
    icon = "π"

    st.set_page_config(
        page_title=title,
        page_icon=icon,
        layout="wide",
        initial_sidebar_state="expanded",
    )
    df = convert_to_dataframe()

    alt.themes.enable("dark")

    # Defaults for the source-specific filters, so that every name read by
    # the filtering code below is bound whichever source is selected.
    selected_language = None
    selected_model_or_dataset = None
    selected_model_architecture = None

    with st.sidebar:
        st.title(f"{title} {icon}")
        st.markdown(
            "You can search for open-source software from [1250+ Japanese NLP"
            " repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
        )

        query = st.text_input(label="Search keyword")

        source_type = ["Hugging Face", "GitHub"]
        selected_source_type = st.selectbox(
            "Choose a source type: Hugging Face or GitHub", source_type
        )

        df = df[df["source"] == selected_source_type]

        is_github = selected_source_type == "GitHub"
        # Popularity column used for sorting, statistics and the charts.
        stats_key = "stargazers_count" if is_github else "downloads"

        if is_github:
            languages = (
                df["languages"]
                .dropna()
                .apply(lambda x: x if isinstance(x, list) else [])
                .explode()
                # Exploding an empty list yields NaN; keep it out of the options.
                .dropna()
                .unique()
            )
            selected_language = st.selectbox(
                "Choose a programming language",
                [""] + languages.tolist(),
                index=0,
            )

            min_stars = int(df["stargazers_count"].min())
            max_stars = int(df["stargazers_count"].max())
            stars_range = st.slider(
                "Choose the range for the stargazer count",
                min_value=min_stars,
                max_value=max_stars,
                value=(min_stars, max_stars),
            )
        else:
            selected_model_or_dataset = st.selectbox(
                "Choose a model or a dataset",
                ["", "model", "dataset"],
                index=0,
            )

            # Offer the architectures ordered from most to least frequent.
            model_architectures_list = [""] + [
                architecture
                for architecture, _ in _count_model_architectures(
                    df
                ).most_common()
            ]
            selected_model_architecture = st.selectbox(
                "Choose a model architecture",
                model_architectures_list,
                index=0,
            )

            min_downloads = int(df["downloads"].min())
            max_downloads = int(df["downloads"].max())
            downloads_range = st.slider(
                "Choose the range for the number of downloads",
                min_value=min_downloads,
                max_value=max_downloads,
                value=(min_downloads, max_downloads),
            )

            min_model_size = int(df["model_size"].min())
            max_model_size = int(df["model_size"].max())
            model_size_range = st.slider(
                "Choose the range for the model size (billion)",
                min_value=min_model_size,
                max_value=max_model_size,
                value=(min_model_size, max_model_size),
            )

        min_activity_period = int(df["activity_period"].min())
        max_activity_period = int(df["activity_period"].max())
        activity_period_range = st.slider(
            "Select the range for activity periods (in days)",
            min_value=min_activity_period,
            max_value=max_activity_period,
            value=(min_activity_period, max_activity_period),
        )

        years = sorted(set(df["year"].dropna().astype(int).tolist()))
        selected_year_range = st.slider(
            "Select a range for the years of the first commit",
            min_value=min(years),
            max_value=max(years),
            value=(min(years), max(years)),
        )

        st.markdown("Sorted by")
        # min_value keeps the count positive; a negative count would slice
        # rows off the end of the result instead of limiting it.
        num_show_repos = st.number_input(
            "Number of sorted repositories", min_value=1, value=15
        )
        latest_repos_btn = st.button("Latest repositories")
        trend_repos_btn = st.button("Trend repositories")

    df = df[_within(df["year"], selected_year_range)]

    if is_github:
        df = df[_within(df["stargazers_count"], stars_range)]
    else:
        # Filter only when the slider was narrowed: a comparison against the
        # full range would still drop every row whose value is missing.
        if (
            downloads_range[0] > min_downloads
            or downloads_range[1] < max_downloads
        ):
            df = df[_within(df["downloads"], downloads_range)]

        if (
            model_size_range[0] > min_model_size
            or model_size_range[1] < max_model_size
        ):
            df = df[_within(df["model_size"], model_size_range)]

    df = df[_within(df["activity_period"], activity_period_range)]

    # Keyword search over description, project name and model architecture.
    df = df[
        _contains_text(df["description"], query)
        | _contains_text(df["project_name"], query)
        | _contains_text(df["model_architectures"], query)
    ]

    if selected_language:
        # Exact membership in the row's language list: a substring/regex
        # match would make "C" or "Java" hit unrelated languages and would
        # raise on names such as "C++".
        has_language = (
            df["languages"]
            .apply(
                lambda langs: isinstance(langs, list)
                and selected_language in langs
            )
            # Guarantee a boolean mask even when the frame is empty.
            .astype(bool)
        )
        df = df[has_language]

    if selected_model_or_dataset:
        df = df[
            _contains_text(
                df["model_or_dataset"], selected_model_or_dataset, literal=True
            )
        ]

    if selected_model_architecture:
        df = df[
            _contains_text(
                df["model_architectures"],
                selected_model_architecture,
                literal=True,
            )
        ]

    if latest_repos_btn:
        df = df.sort_values(by="first_commit", ascending=False)
        df = df[: int(num_show_repos)]

    if trend_repos_btn:
        df = df.sort_values(by=stats_key, ascending=False)
        df = df[: int(num_show_repos)]

    col1, col2 = st.columns(2, gap="large")

    with col1:
        st.markdown("### DataFrame")
        st.markdown(f"#### Number of repositories: {len(df)}")

        # Skip the summary when no row has a value; int(NaN) would raise.
        stats = df[stats_key].dropna()
        if not stats.empty:
            mean_value = int(stats.mean())
            min_value = int(stats.min())
            max_value = int(stats.max())
            st.markdown(
                f"#### {stats_key} mean: {mean_value}, min: {min_value},"
                f" max: {max_value}"
            )

        st.dataframe(df, height=600)

        combined_text = " ".join(df["tokenized_description"].tolist())
        if combined_text.strip():
            st.markdown("### Word Cloud")
            try:
                wordcloud = WordCloud(
                    width=800,
                    height=400,
                    font_path=japanize_matplotlib.get_font_ttf_path(),
                    max_words=50,
                    colormap="PuBu",
                ).generate(combined_text)
            except ValueError:
                # WordCloud raises ValueError when no word is left to plot.
                st.info("Not enough words to draw a word cloud.")
            else:
                fig, ax = plt.subplots()
                ax.imshow(wordcloud, interpolation="bilinear")
                ax.axis("off")
                st.pyplot(fig, use_container_width=True)
                # pyplot keeps every figure alive until it is closed; close it
                # so figures do not pile up across script reruns.
                plt.close(fig)

        if is_github:
            language_counts = Counter(
                language
                for languages_list in df["languages"]
                # Rows without a language list contribute nothing.
                if isinstance(languages_list, list)
                for language in languages_list
            )
            language_df = pd.DataFrame(
                list(language_counts.items()), columns=["Language", "Count"]
            )
            language_df = language_df.sort_values(by="Count", ascending=False)

            st.markdown("### Language Usage Table")
            st.dataframe(language_df)
        else:
            st.markdown("### Model size vs downloads")
            chart = _scatter_chart(
                df,
                x="model_size",
                y="downloads",
                tooltip=["project_name", "model_size", "downloads"],
                title=(
                    "Relationship between model size (Billion) and"
                    " downloads"
                ),
            )
            st.altair_chart(chart, use_container_width=True)

            model_architectures_df = pd.DataFrame(
                list(_count_model_architectures(df).items()),
                columns=["Model_architectures", "Count"],
            )
            model_architectures_df = model_architectures_df.sort_values(
                by="Count", ascending=False
            )

            st.markdown("### Model Architecture Table")
            st.dataframe(model_architectures_df)

    with col2:
        vs_type = stats_key

        st.markdown(f"### First commit vs {vs_type}")
        chart = _scatter_chart(
            df,
            x="first_commit:T",
            y=f"{vs_type}:Q",
            tooltip=["first_commit", "project_name", f"{vs_type}"],
            title=f"Relationship between first commit date and {vs_type}",
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Latest commit vs {vs_type}")
        chart = _scatter_chart(
            df,
            x="latest_commit:T",
            y=f"{vs_type}:Q",
            tooltip=["project_name", "latest_commit", f"{vs_type}"],
            title=f"Relationship between latest commit date and {vs_type}",
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Activity period vs {vs_type}")
        chart = _scatter_chart(
            df,
            x=alt.X("activity_period:Q", title="Activity Period (Days)"),
            y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
            tooltip=[
                "project_name",
                "activity_period",
                f"{vs_type}",
            ],
            title=f"Relationship between activity period and {vs_type}",
        )
        st.altair_chart(chart, use_container_width=True)

        projects_per_year = (
            df.groupby("year").size().reset_index(name="project_count")
        )

        chart = (
            alt.Chart(projects_per_year)
            .mark_bar()
            .encode(
                x=alt.X("year:O", title="Year"),
                y=alt.Y("project_count:Q", title="Number of repositories"),
                tooltip=["year", "project_count"],
            )
            .properties(
                title=(
                    "Number of projects per year based on the year of the"
                    " first commit"
                ),
                width=600,
                height=400,
            )
        )

        st.altair_chart(chart, use_container_width=True)
|
|
|
|
|
# Script entry point: render the dashboard only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|