"""Streamlit dashboard for exploring awesome-japanese-nlp-resources.

Loads the pre-built search index (JSON), lets the user filter repositories
(GitHub) or models/datasets (Hugging Face) from the sidebar, and renders a
dataframe, a word cloud, and several Altair charts.
"""

import json
from collections import Counter

import altair as alt
import japanize_matplotlib
import matplotlib.pyplot as plt
import nagisa
import pandas as pd
import streamlit as st
from datasets import load_dataset
from wordcloud import WordCloud


def read_json(file_name):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_name: Path to the JSON file.

    Returns:
        The deserialized JSON data (dict or list).
    """
    # The index contains Japanese text, so pin the encoding explicitly.
    with open(file_name, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    return json_data


@st.cache_data
def convert_to_dataframe():
    """Build the dashboard DataFrame from the bundled search-index JSON.

    Selects the columns used by the UI, parses commit dates, derives
    ``activity_period``/``year`` helper columns, and adds a
    ``tokenized_description`` column (nagisa tokens minus particles and
    stopwords) used by the word cloud.

    Returns:
        pd.DataFrame sorted by ``score`` descending.
    """
    # Load a json file
    json_file = "awesome-japanese-nlp-resources-search.json"
    json_data = read_json(json_file)
    df = pd.DataFrame(json_data)

    # Keep only the columns the dashboard displays/filters on.
    df = df[
        [
            "project_name",
            "description",
            "url",
            "stargazers_count",
            "downloads",
            "model_architectures",
            "source",
            "score",
            "first_commit",
            "latest_commit",
            "languages",
            "model_or_dataset",
            "model_size",
        ]
    ]
    df = df.sort_values(by="score", ascending=False)

    # Convert DataFrame for Dashboard: parse dates (invalid -> NaT) and
    # derive helper columns for the sliders/charts.
    df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
    df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
    df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days
    # Drop rows with implausible/missing dates (NaT comparisons are False).
    df = df[df["first_commit"] >= "2009-01-01"]
    df = df[df["latest_commit"] >= "2009-01-01"]
    df["str_languages"] = df["languages"].apply(
        lambda x: ",".join(x) if isinstance(x, list) else str(x)
    )
    df["year"] = df["first_commit"].dt.year

    dataset = load_dataset("taishi-i/nagisa_stopwords")
    stopwords = dataset["nagisa_stopwords"]["words"]

    def tokenize_description(description):
        """Tokenize a description for the word cloud.

        Lowercases, removes particles/auxiliary verbs and stopwords,
        and returns the remaining tokens joined by spaces.
        """
        # Guard against missing descriptions (None/NaN): .lower() would
        # raise AttributeError on a non-string value.
        if not isinstance(description, str):
            return ""
        description = description.lower()
        # Filter out Japanese particles and auxiliary verbs.
        tokens = nagisa.filter(description, filter_postags=["助詞", "助動詞"])
        words = tokens.words
        words = [word for word in words if len(word.strip()) > 0]
        words = [word for word in words if word not in stopwords]
        return " ".join(words)

    df["tokenized_description"] = df["description"].apply(tokenize_description)
    return df


def main():
    """Render the dashboard: sidebar filters, dataframe, and charts."""
    # Set streamlit page settings
    title = "Awesome Japanese NLP resources Dashboard"
    icon = "🔎"
    st.set_page_config(
        page_title=title,
        page_icon=icon,
        layout="wide",
        initial_sidebar_state="expanded",
    )

    df = convert_to_dataframe()

    # Main streamlit page (sidebar)
    alt.themes.enable("dark")
    with st.sidebar:
        st.title(f"{title} {icon}")
        st.markdown(
            "You can search for open-source software from [1250+ Japanese NLP"
            " repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
        )
        query = st.text_input(label="Search keyword")

        source_type = ["Hugging Face", "GitHub"]
        selected_source_type = st.selectbox(
            "Choose a source type: Hugging Face or GitHub", source_type
        )

        # Filtering GitHub or Hugging Face
        df = df[df["source"] == selected_source_type]
        if selected_source_type == "GitHub":
            # Widgets from the other branch do not exist in this run;
            # initialize their results so the filter code below is safe.
            # (selected_model_architecture was previously left unset here,
            # which raised NameError when "GitHub" was selected.)
            selected_model_or_dataset = None
            selected_model_architecture = None
            all_languages = (
                df["languages"]
                .dropna()
                .apply(lambda x: x if isinstance(x, list) else [])
                .explode()
                .unique()
            )
            all_languages = [""] + all_languages.tolist()
            selected_languages = st.selectbox(
                "Choose a programming language", all_languages, index=0
            )
            min_stars = int(df["stargazers_count"].min())
            max_stars = int(df["stargazers_count"].max())
            stars_range = st.slider(
                "Choose the range for the stargazer count",
                min_value=min_stars,
                max_value=max_stars,
                value=(min_stars, max_stars),
            )
        else:
            selected_languages = None
            selected_model_or_dataset = st.selectbox(
                "Choose a model or a dataset",
                ["", "model", "dataset"],
                index=0,
            )
            model_architectures = df["model_architectures"].tolist()
            model_architectures_counts = Counter(model_architectures)
            # Drop the "no architecture" bucket; pop() avoids the KeyError
            # that `del counts[None]` raised when every row had one.
            model_architectures_counts.pop(None, None)
            model_architectures_counts = sorted(
                model_architectures_counts.items(),
                key=lambda x: x[1],
                reverse=True,
            )
            model_architectures_list = [""] + [
                model_and_count[0]
                for model_and_count in model_architectures_counts
            ]
            selected_model_architecture = st.selectbox(
                "Choose a model architecture",
                model_architectures_list,
                index=0,
            )
            min_downloads = int(df["downloads"].min())
            max_downloads = int(df["downloads"].max())
            downloads_range = st.slider(
                "Choose the range for the number of downloads",
                min_value=min_downloads,
                max_value=max_downloads,
                value=(min_downloads, max_downloads),
            )
            min_model_size = int(df["model_size"].min())
            max_model_size = int(df["model_size"].max())
            model_size_range = st.slider(
                "Choose the range for the model size (billion)",
                min_value=min_model_size,
                max_value=max_model_size,
                value=(min_model_size, max_model_size),
            )

        min_activity_period = int(df["activity_period"].min())
        max_activity_period = int(df["activity_period"].max())
        activity_period_range = st.slider(
            "Select the range for activity periods (in days)",
            min_value=min_activity_period,
            max_value=max_activity_period,
            value=(min_activity_period, max_activity_period),
        )

        years = sorted(list(set(df["year"].dropna().astype(int).tolist())))
        selected_year_range = st.slider(
            "Select a range for the years of the first commit",
            min_value=min(years),
            max_value=max(years),
            value=(min(years), max(years)),
        )

        st.markdown("Sorted by")
        num_show_repos = st.number_input(
            "Number of sorted repositories", value=15
        )
        latest_repos_btn = st.button("Latest repositories")
        trend_repos_btn = st.button("Trend repositories")

    # Apply the sidebar filters.
    df = df[
        (df["year"] >= selected_year_range[0])
        & (df["year"] <= selected_year_range[1])
    ]

    if selected_source_type == "GitHub":
        df = df[
            (df["stargazers_count"] >= stars_range[0])
            & (df["stargazers_count"] <= stars_range[1])
        ]
    else:
        # Only filter when the user narrowed the default range, so that
        # rows with missing downloads/model_size are kept by default.
        if (
            downloads_range[0] > min_downloads
            or downloads_range[1] < max_downloads
        ):
            df = df[
                (df["downloads"] >= downloads_range[0])
                & (df["downloads"] <= downloads_range[1])
            ]
        if (
            model_size_range[0] > min_model_size
            or model_size_range[1] < max_model_size
        ):
            df = df[
                (df["model_size"] >= model_size_range[0])
                & (df["model_size"] <= model_size_range[1])
            ]

    df = df[
        (df["activity_period"] >= activity_period_range[0])
        & (df["activity_period"] <= activity_period_range[1])
    ]

    # Keyword search over description, project name, and architecture.
    # An empty query matches everything.
    contained_description = df["description"].str.contains(
        query, case=False, na=False
    )
    contained_project_name = df["project_name"].str.contains(
        query, case=False, na=False
    )
    contained_model_arch = df["model_architectures"].str.contains(
        query, case=False, na=False
    )
    df = df[
        contained_description | contained_project_name | contained_model_arch
    ]

    if selected_languages:
        df = df[
            df["str_languages"].str.contains(
                selected_languages, case=False, na=False
            )
        ]

    if selected_model_or_dataset:
        df = df[
            df["model_or_dataset"].str.contains(
                selected_model_or_dataset, case=False, na=False
            )
        ]

    if selected_model_architecture:
        contained_model_arch = df["model_architectures"].str.contains(
            selected_model_architecture, case=False, na=False
        )
        df = df[contained_model_arch]

    if latest_repos_btn:
        df = df.sort_values(by="first_commit", ascending=False)
        df = df[:num_show_repos]

    if trend_repos_btn:
        if selected_source_type == "GitHub":
            stats_key = "stargazers_count"
        else:
            stats_key = "downloads"
        df = df.sort_values(by=stats_key, ascending=False)
        df = df[:num_show_repos]

    # Main streamlit page (columns)
    col1, col2 = st.columns(2, gap="large")
    with col1:
        st.markdown("### DataFrame")
        st.markdown(f"#### Number of repositories: {len(df)}")
        if selected_source_type == "GitHub":
            stats_key = "stargazers_count"
        else:
            stats_key = "downloads"
        if len(df) > 0:
            mean_value = int(df[stats_key].mean())
            min_value = int(df[stats_key].min())
            max_value = int(df[stats_key].max())
            st.markdown(
                f"#### {stats_key} mean: {mean_value}, min: {min_value},"
                f" max: {max_value}"
            )
        st.dataframe(df, height=600)

        if len(df) > 0:
            st.markdown("### Word Cloud")
            descriptions = df["tokenized_description"].tolist()
            combined_text = " ".join(descriptions)
            wordcloud = WordCloud(
                width=800,
                height=400,
                font_path=japanize_matplotlib.get_font_ttf_path(),
                max_words=50,
                colormap="PuBu",
            ).generate(combined_text)
            fig, ax = plt.subplots()
            ax.imshow(wordcloud, interpolation="bilinear")
            ax.axis("off")
            st.pyplot(fig, use_container_width=True)

        if selected_source_type == "GitHub":
            # Skip NaN entries: iterating a non-list value would raise
            # TypeError (same guard as used for str_languages above).
            all_languages = [
                language
                for languages_list in df["languages"]
                if isinstance(languages_list, list)
                for language in languages_list
            ]
            language_counts = Counter(all_languages)
            language_df = pd.DataFrame(
                language_counts.items(), columns=["Language", "Count"]
            )
            language_df = language_df.sort_values(by="Count", ascending=False)
            st.markdown("### Language Usage Table")
            st.dataframe(language_df)
        else:
            st.markdown("### Model size vs downloads")
            chart = (
                alt.Chart(df)
                .mark_circle(size=60)
                .encode(
                    x="model_size",
                    y="downloads",
                    tooltip=["project_name", "model_size", "downloads"],
                )
                .properties(
                    title=(
                        "Relationship between model size (Billion) and"
                        " downloads"
                    ),
                )
                .interactive()
            )
            st.altair_chart(chart, use_container_width=True)

            model_architectures = df["model_architectures"].tolist()
            model_architectures_counts = Counter(model_architectures)
            # pop() instead of del: None may be absent after filtering.
            model_architectures_counts.pop(None, None)
            model_architectures_df = pd.DataFrame(
                model_architectures_counts.items(),
                columns=["Model_architectures", "Count"],
            )
            model_architectures_df = model_architectures_df.sort_values(
                by="Count", ascending=False
            )
            st.markdown("### Model Architecture Table")
            st.dataframe(model_architectures_df)

    with col2:
        # Charts compare dates/activity against the source-appropriate
        # popularity metric.
        if selected_source_type == "GitHub":
            vs_type = "stargazers_count"
        else:
            vs_type = "downloads"

        st.markdown(f"### First commit vs {vs_type}")
        chart = (
            alt.Chart(df)
            .mark_circle(size=60)
            .encode(
                x="first_commit:T",
                y=f"{vs_type}:Q",
                tooltip=["first_commit", "project_name", f"{vs_type}"],
            )
            .properties(
                title=f"Relationship between first commit date and {vs_type}",
            )
            .interactive()
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Latest commit vs {vs_type}")
        chart = (
            alt.Chart(df)
            .mark_circle(size=60)
            .encode(
                x="latest_commit:T",
                y=f"{vs_type}:Q",
                tooltip=["project_name", "latest_commit", f"{vs_type}"],
            )
            .properties(
                title=f"Relationship between latest commit date and {vs_type}",
            )
            .interactive()
        )
        st.altair_chart(chart, use_container_width=True)

        st.markdown(f"### Activity period vs {vs_type}")
        chart = (
            alt.Chart(df)
            .mark_circle(size=60)
            .encode(
                x=alt.X("activity_period:Q", title="Activity Period (Days)"),
                y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
                tooltip=[
                    "project_name",
                    "activity_period",
                    f"{vs_type}",
                ],
            )
            .properties(
                title=f"Relationship between activity period and {vs_type}",
            )
            .interactive()
        )
        st.altair_chart(chart, use_container_width=True)

        projects_per_year = (
            df.groupby("year").size().reset_index(name="project_count")
        )
        chart = (
            alt.Chart(projects_per_year)
            .mark_bar()
            .encode(
                x=alt.X("year:O", title="Year"),
                y=alt.Y("project_count:Q", title="Number of repositories"),
                tooltip=["year", "project_count"],
            )
            .properties(
                title=(
                    "Number of projects per year based on the year of the"
                    " first commit"
                ),
                width=600,
                height=400,
            )
        )
        st.altair_chart(chart, use_container_width=True)


if __name__ == "__main__":
    main()