Spaces:

taishi-i
/

awesome-japanese-nlp-resources-dashboard

Running

App Files Files Community

awesome-japanese-nlp-resources-dashboard / app.py

taishi-i

update app.py

7dcaa5c 25 days ago

raw

history blame

No virus

14.8 kB

	import json
	from collections import Counter

	import altair as alt
	import japanize_matplotlib
	import matplotlib.pyplot as plt
	import nagisa
	import pandas as pd
	import streamlit as st
	from datasets import load_dataset
	from wordcloud import WordCloud


	def read_json(file_name):
	    """Load and return the parsed contents of a JSON file.

	    Args:
	        file_name: Path of the JSON file to read.

	    Returns:
	        The deserialized JSON document (whatever ``json.load`` produces).
	    """
	    # Decode explicitly as UTF-8 so non-ASCII (e.g. Japanese) text is read
	    # the same way regardless of the platform's default locale encoding.
	    with open(file_name, "r", encoding="utf-8") as f:
	        return json.load(f)


	@st.cache_data
	def convert_to_dataframe():
	    """Load the resource index and prepare it for the dashboard.

	    Reads ``awesome-japanese-nlp-resources-search.json``, keeps the columns
	    the dashboard uses, orders rows by ``score`` (highest first), parses the
	    commit dates and adds the derived columns ``activity_period``,
	    ``str_languages``, ``year`` and ``tokenized_description``. Rows whose
	    first or latest commit is missing, unparsable or earlier than 2009-01-01
	    are dropped. The result is cached through ``st.cache_data``.

	    Returns:
	        pandas.DataFrame: One row per project, ready for filtering/plotting.
	    """
	    # Load a json file
	    json_file = "awesome-japanese-nlp-resources-search.json"
	    df = pd.DataFrame(read_json(json_file))

	    # Keep only the columns used by the dashboard, best score first
	    columns = [
	        "project_name",
	        "description",
	        "url",
	        "stargazers_count",
	        "downloads",
	        "model_architectures",
	        "source",
	        "score",
	        "first_commit",
	        "latest_commit",
	        "languages",
	        "model_or_dataset",
	        "model_size",
	    ]
	    df = df[columns].sort_values(by="score", ascending=False)

	    # Convert DataFrame for Dashboard
	    # Unparsable dates become NaT and are removed by the filter below.
	    df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
	    df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
	    df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days

	    is_recent = (df["first_commit"] >= "2009-01-01") & (
	        df["latest_commit"] >= "2009-01-01"
	    )
	    # Copy so the column assignments below write to an independent frame
	    # instead of a filtered view of the original.
	    df = df[is_recent].copy()

	    df["str_languages"] = df["languages"].apply(
	        lambda x: ",".join(x) if isinstance(x, list) else str(x)
	    )
	    df["year"] = df["first_commit"].dt.year

	    dataset = load_dataset("taishi-i/nagisa_stopwords")
	    # A set gives constant-time membership tests for every token.
	    stopwords = set(dataset["nagisa_stopwords"]["words"])

	    def tokenize_description(description):
	        """Return the description as space-separated content words."""
	        # Missing descriptions (None/NaN) contribute no words.
	        if not isinstance(description, str):
	            return ""
	        tokens = nagisa.filter(
	            description.lower(), filter_postags=["助詞", "助動詞"]
	        )
	        return " ".join(
	            word
	            for word in tokens.words
	            if word.strip() and word not in stopwords
	        )

	    df["tokenized_description"] = df["description"].apply(tokenize_description)
	    return df


	def _literal_contains(series, needle):
	    """Return a boolean mask of the rows whose text contains *needle*.

	    The test is a case-insensitive plain substring match (``regex=False``),
	    so input such as ``"C++"`` or ``"("`` is never interpreted as a regular
	    expression. Missing values never match.
	    """
	    return series.str.contains(needle, case=False, na=False, regex=False)


	def _range_slider(label, series):
	    """Render a two-ended slider spanning the integer min/max of *series*.

	    Returns:
	        A pair ``(bounds, selected)`` where ``bounds`` is ``(low, high)`` as
	        computed from the data and ``selected`` is the slider's value.
	    """
	    low = int(series.min())
	    high = int(series.max())
	    selected = st.slider(
	        label, min_value=low, max_value=high, value=(low, high)
	    )
	    return (low, high), selected


	def _flatten_languages(df):
	    """Return every language name in ``df["languages"]``, in row order.

	    Rows whose value is not a list (e.g. missing) are skipped.
	    """
	    return [
	        language
	        for languages_list in df["languages"]
	        if isinstance(languages_list, list)
	        for language in languages_list
	    ]


	def _scatter_chart(df, x, y, tooltip, title):
	    """Build an interactive Altair scatter plot of *df*."""
	    return (
	        alt.Chart(df)
	        .mark_circle(size=60)
	        .encode(x=x, y=y, tooltip=tooltip)
	        .properties(title=title)
	        .interactive()
	    )


	def main():
	    """Render the dashboard: sidebar filters, result table and charts.

	    The sidebar collects a search keyword, the source type and range
	    filters; the filtered frame is then shown as a table with a word cloud
	    and per-source summary tables in the left column and scatter/bar charts
	    in the right column.
	    """
	    # Set streamlit page settings
	    title = "Awesome Japanese NLP resources Dashboard"
	    icon = "🔎"

	    st.set_page_config(
	        page_title=title,
	        page_icon=icon,
	        layout="wide",
	        initial_sidebar_state="expanded",
	    )
	    df = convert_to_dataframe()

	    # Filters that only one source type exposes default to "not selected",
	    # so the filtering code below can test them on either path.
	    selected_language = None
	    selected_model_or_dataset = None
	    selected_model_architecture = None

	    # Main streamlit page (sidebar)
	    alt.themes.enable("dark")
	    with st.sidebar:
	        st.title(f"{title} {icon}")
	        st.markdown(
	            "You can search for open-source software from [1250+ Japanese NLP"
	            " repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
	        )

	        query = st.text_input(label="Search keyword")

	        source_type = ["Hugging Face", "GitHub"]
	        selected_source_type = st.selectbox(
	            "Choose a source type: Hugging Face or GitHub", source_type
	        )
	        is_github = selected_source_type == "GitHub"
	        # Popularity metric shown in the summary, charts and trend sorting.
	        stats_key = "stargazers_count" if is_github else "downloads"

	        # Filtering GitHub or Hugging Face
	        df = df[df["source"] == selected_source_type]

	        if is_github:
	            # dict.fromkeys de-duplicates while keeping first-seen order.
	            language_options = [""] + list(
	                dict.fromkeys(_flatten_languages(df))
	            )
	            selected_language = st.selectbox(
	                "Choose a programming language", language_options, index=0
	            )

	            _, stars_range = _range_slider(
	                "Choose the range for the stargazer count",
	                df["stargazers_count"],
	            )
	        else:
	            selected_model_or_dataset = st.selectbox(
	                "Choose a model or a dataset",
	                ["", "model", "dataset"],
	                index=0,
	            )

	            # Most frequent architectures first; missing values are ignored.
	            architecture_counts = Counter(df["model_architectures"].dropna())
	            model_architectures_list = [""] + [
	                name for name, _ in architecture_counts.most_common()
	            ]
	            selected_model_architecture = st.selectbox(
	                "Choose a model architecture",
	                model_architectures_list,
	                index=0,
	            )

	            downloads_bounds, downloads_range = _range_slider(
	                "Choose the range for the number of downloads",
	                df["downloads"],
	            )
	            model_size_bounds, model_size_range = _range_slider(
	                "Choose the range for the model size (billion)",
	                df["model_size"],
	            )

	        _, activity_period_range = _range_slider(
	            "Select the range for activity periods (in days)",
	            df["activity_period"],
	        )
	        _, selected_year_range = _range_slider(
	            "Select a range for the years of the first commit",
	            df["year"],
	        )

	        st.markdown("Sorted by")
	        # min_value keeps the value a positive row count for head() below.
	        num_show_repos = st.number_input(
	            "Number of sorted repositories", value=15, min_value=1
	        )
	        latest_repos_btn = st.button("Latest repositories")
	        trend_repos_btn = st.button("Trend repositories")

	    # Apply the sidebar selections
	    df = df[df["year"].between(selected_year_range[0], selected_year_range[1])]

	    if is_github:
	        df = df[df["stargazers_count"].between(stars_range[0], stars_range[1])]
	    else:
	        # These two filters run only when the slider was narrowed;
	        # presumably so rows with a missing value are kept while the full
	        # range is selected (a comparison would drop them).
	        if downloads_range != downloads_bounds:
	            df = df[
	                df["downloads"].between(downloads_range[0], downloads_range[1])
	            ]
	        if model_size_range != model_size_bounds:
	            df = df[
	                df["model_size"].between(
	                    model_size_range[0], model_size_range[1]
	                )
	            ]

	    df = df[
	        df["activity_period"].between(
	            activity_period_range[0], activity_period_range[1]
	        )
	    ]

	    # Keyword search over description, project name and architecture
	    df = df[
	        _literal_contains(df["description"], query)
	        | _literal_contains(df["project_name"], query)
	        | _literal_contains(df["model_architectures"], query)
	    ]

	    if selected_language:
	        # Exact membership: a substring test would make "C" match
	        # "JavaScript" or "CSS". astype(bool) keeps the mask boolean even
	        # when the frame is already empty.
	        has_language = (
	            df["languages"]
	            .apply(
	                lambda langs: isinstance(langs, list)
	                and selected_language in langs
	            )
	            .astype(bool)
	        )
	        df = df[has_language]

	    if selected_model_or_dataset:
	        df = df[
	            _literal_contains(df["model_or_dataset"], selected_model_or_dataset)
	        ]

	    if selected_model_architecture:
	        # The option list holds exact column values, so compare exactly.
	        df = df[df["model_architectures"] == selected_model_architecture]

	    if latest_repos_btn:
	        df = df.sort_values(by="first_commit", ascending=False)
	        df = df.head(num_show_repos)

	    if trend_repos_btn:
	        df = df.sort_values(by=stats_key, ascending=False)
	        df = df.head(num_show_repos)

	    # Main streamlit page (columns)
	    col1, col2 = st.columns(2, gap="large")

	    with col1:
	        st.markdown("### DataFrame")
	        st.markdown(f"#### Number of repositories: {len(df)}")

	        # Summarise only when there is at least one real value; int() of a
	        # NaN mean/min/max would raise.
	        stats_values = df[stats_key].dropna()
	        if not stats_values.empty:
	            mean_value = int(stats_values.mean())
	            min_value = int(stats_values.min())
	            max_value = int(stats_values.max())
	            st.markdown(
	                f"#### {stats_key} mean: {mean_value}, min: {min_value},"
	                f" max: {max_value}"
	            )

	        st.dataframe(df, height=600)

	        combined_text = " ".join(df["tokenized_description"].tolist())
	        # Skip the word cloud when there is no text to draw.
	        if combined_text.strip():
	            st.markdown("### Word Cloud")

	            wordcloud = WordCloud(
	                width=800,
	                height=400,
	                font_path=japanize_matplotlib.get_font_ttf_path(),
	                max_words=50,
	                colormap="PuBu",
	            ).generate(combined_text)

	            fig, ax = plt.subplots()
	            ax.imshow(wordcloud, interpolation="bilinear")
	            ax.axis("off")
	            st.pyplot(fig, use_container_width=True)
	            # Release the figure; pyplot otherwise keeps every figure
	            # created on each rerun alive.
	            plt.close(fig)

	        if is_github:
	            language_counts = Counter(_flatten_languages(df))
	            language_df = pd.DataFrame(
	                list(language_counts.items()), columns=["Language", "Count"]
	            )
	            language_df = language_df.sort_values(by="Count", ascending=False)

	            st.markdown("### Language Usage Table")
	            st.dataframe(language_df)
	        else:
	            st.markdown("### Model size vs downloads")
	            chart = _scatter_chart(
	                df,
	                x="model_size",
	                y="downloads",
	                tooltip=["project_name", "model_size", "downloads"],
	                title=(
	                    "Relationship between model size (Billion) and"
	                    " downloads"
	                ),
	            )
	            st.altair_chart(chart, use_container_width=True)

	            architecture_counts = Counter(df["model_architectures"].dropna())
	            model_architectures_df = pd.DataFrame(
	                list(architecture_counts.items()),
	                columns=["Model_architectures", "Count"],
	            )
	            model_architectures_df = model_architectures_df.sort_values(
	                by="Count", ascending=False
	            )

	            st.markdown("### Model Architecture Table")
	            st.dataframe(model_architectures_df)

	    with col2:
	        vs_type = stats_key

	        st.markdown(f"### First commit vs {vs_type}")
	        chart = _scatter_chart(
	            df,
	            x="first_commit:T",
	            y=f"{vs_type}:Q",
	            tooltip=["first_commit", "project_name", f"{vs_type}"],
	            title=f"Relationship between first commit date and {vs_type}",
	        )
	        st.altair_chart(chart, use_container_width=True)

	        st.markdown(f"### Latest commit vs {vs_type}")
	        chart = _scatter_chart(
	            df,
	            x="latest_commit:T",
	            y=f"{vs_type}:Q",
	            tooltip=["project_name", "latest_commit", f"{vs_type}"],
	            title=f"Relationship between latest commit date and {vs_type}",
	        )
	        st.altair_chart(chart, use_container_width=True)

	        st.markdown(f"### Activity period vs {vs_type}")
	        chart = _scatter_chart(
	            df,
	            x=alt.X("activity_period:Q", title="Activity Period (Days)"),
	            y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
	            tooltip=["project_name", "activity_period", f"{vs_type}"],
	            title=f"Relationship between activity period and {vs_type}",
	        )
	        st.altair_chart(chart, use_container_width=True)

	        projects_per_year = (
	            df.groupby("year").size().reset_index(name="project_count")
	        )

	        chart = (
	            alt.Chart(projects_per_year)
	            .mark_bar()
	            .encode(
	                x=alt.X("year:O", title="Year"),
	                y=alt.Y("project_count:Q", title="Number of repositories"),
	                tooltip=["year", "project_count"],
	            )
	            .properties(
	                title=(
	                    "Number of projects per year based on the year of the"
	                    " first commit"
	                ),
	                width=600,
	                height=400,
	            )
	        )

	        st.altair_chart(chart, use_container_width=True)


	# Launch the dashboard only when this file is executed as a script.
	if __name__ == "__main__":
	    main()