taishi-i committed
Commit 2ffaad6
1 Parent(s): 1db9748

update app.py

Files changed (2)
  1. app.py +282 -226
  2. requirements.txt +5 -0
app.py CHANGED
@@ -1,8 +1,13 @@
 import json
 
 import altair as alt
+import japanize_matplotlib
+import matplotlib.pyplot as plt
+import nagisa
 import pandas as pd
 import streamlit as st
+from datasets import load_dataset
+from wordcloud import WordCloud
 
 
 def read_json(file_name):
@@ -11,256 +16,307 @@ def read_json(file_name):
     return json_data
 
 
-# Load a json file
-json_file = "awesome-japanese-nlp-resources-search.json"
-json_data = read_json(json_file)
-df = pd.DataFrame(json_data)
-
-# Sorted by selected columns
-df = df[
-    [
-        "project_name",
-        "description",
-        "url",
-        "stargazers_count",
-        "downloads",
-        "source",
-        "score",
-        "first_commit",
-        "latest_commit",
-        "languages",
-        "model_or_dataset",
-    ]
-]
-df = df.sort_values(by="score", ascending=False)
-
-
-# Convert DataFrame for Dashboard
-df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
-df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
-df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days
-df = df[df["first_commit"] >= "2009-01-01"]
-df = df[df["latest_commit"] >= "2009-01-01"]
-df["str_languages"] = df["languages"].apply(
-    lambda x: ",".join(x) if isinstance(x, list) else str(x)
-)
-df["year"] = df["first_commit"].dt.year
-
-
-# Set streamlit page settings
-title = "Awesome Japanese NLP resources Dashboard"
-icon = "πŸ”Ž"
-
-st.set_page_config(
-    page_title=title,
-    page_icon=icon,
-    layout="wide",
-    initial_sidebar_state="expanded",
-)
-
-# Main streamlit page (sidebar)
-alt.themes.enable("dark")
-with st.sidebar:
-    st.title(f"{title} {icon}")
-    st.markdown(
-        "You can search for open-source software from [1250+ Japanese NLP repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
-    )
-    query = st.text_input(label="Search keyword")

-    source_type = ["GitHub", "Hugging Face"]
-    selected_source_type = st.selectbox(
-        "Choose a source type: GitHub or Hugging Face", source_type
     )

-    # Filtering GitHub or Hugging Face
-    df = df[df["source"] == selected_source_type]
-
-    if selected_source_type == "GitHub":
-        selected_model_or_dataset = None
-        all_languages = (
-            df["languages"]
-            .dropna()
-            .apply(lambda x: x if isinstance(x, list) else [])
-            .explode()
-            .unique()
-        )
-        all_languages = [""] + all_languages.tolist()
-        selected_languges = st.selectbox(
-            "Choose a programming language", all_languages, index=0
-        )
-
-        min_stars = int(df["stargazers_count"].min())
-        max_stars = int(df["stargazers_count"].max())
-
-        stars_range = st.slider(
-            "Choose the range for the stargazer count",
-            min_value=min_stars,
-            max_value=max_stars,
-            value=(min_stars, max_stars),
-        )
-    else:
-        selected_languges = None
-        selected_model_or_dataset = st.selectbox(
-            "Choose a model or a dataset", ["", "model", "dataset"], index=0
-        )

-        min_downloads = int(df["downloads"].min())
-        max_downloads = int(df["downloads"].max())

-        downloads_range = st.slider(
-            "Choose the range for the number of downloads",
-            min_value=min_downloads,
-            max_value=max_downloads,
-            value=(min_downloads, max_downloads),
-        )

-    min_activity_period = int(df["activity_period"].min())
-    max_activity_period = int(df["activity_period"].max())

-    activity_period_range = st.slider(
-        "Select the range for activity periods (in days)",
-        min_value=min_activity_period,
-        max_value=max_activity_period,
-        value=(min_activity_period, max_activity_period),
-    )
-    years = sorted(list(set(df["year"].dropna().astype(int).tolist())))

-    selected_year_range = st.slider(
-        "Select a range for the years of the first commit",
-        min_value=min(years),
-        max_value=max(years),
-        value=(min(years), max(years)),
     )


-    df = df[
-        (df["year"] >= selected_year_range[0])
-        & (df["year"] <= selected_year_range[1])
-    ]


-    if selected_source_type == "GitHub":
-        df = df[
-            (df["stargazers_count"] >= stars_range[0])
-            & (df["stargazers_count"] <= stars_range[1])
-        ]
-    else:
         df = df[
-            (df["downloads"] >= downloads_range[0])
-            & (df["downloads"] <= downloads_range[1])
         ]

-    df = df[
-        (df["activity_period"] >= activity_period_range[0])
-        & (df["activity_period"] <= activity_period_range[1])
-    ]
-
-    contained_description = df["description"].str.contains(
-        query, case=False, na=False
-    )
-    contained_project_name = df["project_name"].str.contains(
-        query, case=False, na=False
-    )
-    df = df[contained_description | contained_project_name]
-
-    if selected_languges:
-        df = df[
-            df["str_languages"].str.contains(
-                selected_languges, case=False, na=False
-            )
-        ]

-    if selected_model_or_dataset:
         df = df[
-            df["model_or_dataset"].str.contains(
-                selected_model_or_dataset, case=False, na=False
-            )
         ]

-
-# Main streamlit page (columns)
-col1, col2 = st.columns(2, gap="large")
-
-with col1:
-    st.markdown("### DataFrame")
-    st.markdown(f"#### Number of repositories: {len(df)}")
-    st.dataframe(df, height=600)
-
-    projects_per_year = (
-        df.groupby("year").size().reset_index(name="project_count")
     )
-
-    chart = (
-        alt.Chart(projects_per_year)
-        .mark_bar()
-        .encode(
-            x=alt.X("year:O", title="Year"),
-            y=alt.Y("project_count:Q", title="Number of repositories"),
-            tooltip=["year", "project_count"],
-        )
-        .properties(
-            title="Number of projects per year based on the year of the first commit",
-            width=600,
-            height=400,
-        )
     )
-
-    st.altair_chart(chart, use_container_width=True)
-
-with col2:
-    if selected_source_type == "GitHub":
-        vs_type = "stargazers_count"
-    else:
-        vs_type = "downloads"
-
-    st.markdown(f"### First commit vs {vs_type}")
-    chart = (
-        alt.Chart(df)
-        .mark_circle(size=60)
-        .encode(
-            x="first_commit:T",
-            y=f"{vs_type}:Q",
-            tooltip=["first_commit", "project_name", f"{vs_type}"],
         )
-        .properties(
-            title=f"Relationship between first commit date and {vs_type}",
         )
-        .interactive()
-    )
-    st.altair_chart(chart, use_container_width=True)
-
-    st.markdown(f"### Latest commit vs {vs_type}")
-    chart = (
-        alt.Chart(df)
-        .mark_circle(size=60)
-        .encode(
-            x="latest_commit:T",
-            y=f"{vs_type}:Q",
-            tooltip=["project_name", "latest_commit", f"{vs_type}"],
-        )
-        .properties(
-            title=f"Relationship between latest commit date and {vs_type}",
         )
-        .interactive()
-    )
-    st.altair_chart(chart, use_container_width=True)
-
-    st.markdown(f"### Activity period vs {vs_type}")
-    chart = (
-        alt.Chart(df)
-        .mark_circle(size=60)
-        .encode(
-            x=alt.X("activity_period:Q", title="Activity Period (Days)"),
-            y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
-            tooltip=[
-                "project_name",
-                "activity_period",
-                f"{vs_type}",
-            ],
         )
-        .properties(
-            title=f"Relationship between activity period and {vs_type}",
         )
-        .interactive()
-    )
-    st.altair_chart(chart, use_container_width=True)
+@st.cache_data
+def convert_to_dataframe():
+    # Load a json file
+    json_file = "awesome-japanese-nlp-resources-search.json"
+    json_data = read_json(json_file)
+    df = pd.DataFrame(json_data)

+    # Sorted by selected columns
+    df = df[
+        [
+            "project_name",
+            "description",
+            "url",
+            "stargazers_count",
+            "downloads",
+            "source",
+            "score",
+            "first_commit",
+            "latest_commit",
+            "languages",
+            "model_or_dataset",
+        ]
+    ]
+    df = df.sort_values(by="score", ascending=False)
+
+    # Convert DataFrame for Dashboard
+    df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
+    df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
+    df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days
+    df = df[df["first_commit"] >= "2009-01-01"]
+    df = df[df["latest_commit"] >= "2009-01-01"]
+    df["str_languages"] = df["languages"].apply(
+        lambda x: ",".join(x) if isinstance(x, list) else str(x)
     )
+    df["year"] = df["first_commit"].dt.year

+    dataset = load_dataset("taishi-i/nagisa_stopwords")
+    stopwords = dataset["nagisa_stopwords"]["words"]

+    def tokenize_description(description):
+        tokens = nagisa.filter(description, filter_postags=["助詞", "εŠ©ε‹•θ©ž"])
+        words = tokens.words
+        words = [word for word in words if len(word.strip()) > 0]
+        words = [word for word in words if word not in stopwords]
+        words = " ".join(words)
+        return words

+    df["tokenized_description"] = df["description"].apply(tokenize_description)
+    return df


+def main():
+    # Set streamlit page settings
+    title = "Awesome Japanese NLP resources Dashboard"
+    icon = "πŸ”Ž"

+    st.set_page_config(
+        page_title=title,
+        page_icon=icon,
+        layout="wide",
+        initial_sidebar_state="expanded",
     )
+    df = convert_to_dataframe()
+
+    # Main streamlit page (sidebar)
+    alt.themes.enable("dark")
+    with st.sidebar:
+        st.title(f"{title} {icon}")
+        st.markdown(
+            "You can search for open-source software from [1250+ Japanese NLP repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
+        )
+        query = st.text_input(label="Search keyword")

+        source_type = ["GitHub", "Hugging Face"]
+        selected_source_type = st.selectbox(
+            "Choose a source type: GitHub or Hugging Face", source_type
+        )

+        # Filtering GitHub or Hugging Face
+        df = df[df["source"] == selected_source_type]
+
+        if selected_source_type == "GitHub":
+            selected_model_or_dataset = None
+            all_languages = (
+                df["languages"]
+                .dropna()
+                .apply(lambda x: x if isinstance(x, list) else [])
+                .explode()
+                .unique()
+            )
+            all_languages = [""] + all_languages.tolist()
+            selected_languges = st.selectbox(
+                "Choose a programming language", all_languages, index=0
+            )
+
+            min_stars = int(df["stargazers_count"].min())
+            max_stars = int(df["stargazers_count"].max())
+
+            stars_range = st.slider(
+                "Choose the range for the stargazer count",
+                min_value=min_stars,
+                max_value=max_stars,
+                value=(min_stars, max_stars),
+            )
+        else:
+            selected_languges = None
+            selected_model_or_dataset = st.selectbox(
+                "Choose a model or a dataset",
+                ["", "model", "dataset"],
+                index=0,
+            )
+
+            min_downloads = int(df["downloads"].min())
+            max_downloads = int(df["downloads"].max())
+
+            downloads_range = st.slider(
+                "Choose the range for the number of downloads",
+                min_value=min_downloads,
+                max_value=max_downloads,
+                value=(min_downloads, max_downloads),
+            )
+
+        min_activity_period = int(df["activity_period"].min())
+        max_activity_period = int(df["activity_period"].max())
+
+        activity_period_range = st.slider(
+            "Select the range for activity periods (in days)",
+            min_value=min_activity_period,
+            max_value=max_activity_period,
+            value=(min_activity_period, max_activity_period),
+        )
+        years = sorted(list(set(df["year"].dropna().astype(int).tolist())))

+        selected_year_range = st.slider(
+            "Select a range for the years of the first commit",
+            min_value=min(years),
+            max_value=max(years),
+            value=(min(years), max(years)),
+        )

         df = df[
+            (df["year"] >= selected_year_range[0])
+            & (df["year"] <= selected_year_range[1])
         ]

+        if selected_source_type == "GitHub":
+            df = df[
+                (df["stargazers_count"] >= stars_range[0])
+                & (df["stargazers_count"] <= stars_range[1])
+            ]
+        else:
+            df = df[
+                (df["downloads"] >= downloads_range[0])
+                & (df["downloads"] <= downloads_range[1])
+            ]

         df = df[
+            (df["activity_period"] >= activity_period_range[0])
+            & (df["activity_period"] <= activity_period_range[1])
         ]

+        contained_description = df["description"].str.contains(
+            query, case=False, na=False
         )
+        contained_project_name = df["project_name"].str.contains(
+            query, case=False, na=False
         )
+        df = df[contained_description | contained_project_name]
+
+        if selected_languges:
+            df = df[
+                df["str_languages"].str.contains(
+                    selected_languges, case=False, na=False
+                )
+            ]
+
+        if selected_model_or_dataset:
+            df = df[
+                df["model_or_dataset"].str.contains(
+                    selected_model_or_dataset, case=False, na=False
+                )
+            ]
+
+    # Main streamlit page (columns)
+    col1, col2 = st.columns(2, gap="large")
+
+    with col1:
+        st.markdown("### DataFrame")
+        st.markdown(f"#### Number of repositories: {len(df)}")
+        if selected_source_type == "GitHub":
+            stats_key = "stargazers_count"
+        else:
+            stats_key = "downloads"
+
+        if len(df) > 0:
+            mean_value = int(df[stats_key].mean())
+            min_value = int(df[stats_key].min())
+            max_value = int(df[stats_key].max())
+            st.markdown(
+                f"#### {stats_key} mean: {int(mean_value)}, min: {min_value}, max: {max_value}"
+            )
+
+        st.dataframe(df, height=600)
+
+        if len(df) > 0:
+            st.markdown("### Word Cloud")
+            descriptions = df["tokenized_description"].tolist()
+            combined_text = " ".join(descriptions)
+
+            wordcloud = WordCloud(
+                width=800,
+                height=400,
+                font_path=japanize_matplotlib.get_font_ttf_path(),
+                max_words=50,
+                colormap="PuBu",
+            ).generate(combined_text)
+
+            fig, ax = plt.subplots()
+            ax.imshow(wordcloud, interpolation="bilinear")
+            ax.axis("off")
+            st.pyplot(fig, use_container_width=True)
+
+    with col2:
+        if selected_source_type == "GitHub":
+            vs_type = "stargazers_count"
+        else:
+            vs_type = "downloads"
+
+        st.markdown(f"### First commit vs {vs_type}")
+        chart = (
+            alt.Chart(df)
+            .mark_circle(size=60)
+            .encode(
+                x="first_commit:T",
+                y=f"{vs_type}:Q",
+                tooltip=["first_commit", "project_name", f"{vs_type}"],
+            )
+            .properties(
+                title=f"Relationship between first commit date and {vs_type}",
+            )
+            .interactive()
         )
+        st.altair_chart(chart, use_container_width=True)
+
+        st.markdown(f"### Latest commit vs {vs_type}")
+        chart = (
+            alt.Chart(df)
+            .mark_circle(size=60)
+            .encode(
+                x="latest_commit:T",
+                y=f"{vs_type}:Q",
+                tooltip=["project_name", "latest_commit", f"{vs_type}"],
+            )
+            .properties(
+                title=f"Relationship between latest commit date and {vs_type}",
+            )
+            .interactive()
         )
+        st.altair_chart(chart, use_container_width=True)
+
+        st.markdown(f"### Activity period vs {vs_type}")
+        chart = (
+            alt.Chart(df)
+            .mark_circle(size=60)
+            .encode(
+                x=alt.X("activity_period:Q", title="Activity Period (Days)"),
+                y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
+                tooltip=[
+                    "project_name",
+                    "activity_period",
+                    f"{vs_type}",
+                ],
+            )
+            .properties(
+                title=f"Relationship between activity period and {vs_type}",
+            )
+            .interactive()
         )
+        st.altair_chart(chart, use_container_width=True)
+
+        projects_per_year = (
+            df.groupby("year").size().reset_index(name="project_count")
         )
+
+        chart = (
+            alt.Chart(projects_per_year)
+            .mark_bar()
+            .encode(
+                x=alt.X("year:O", title="Year"),
+                y=alt.Y("project_count:Q", title="Number of repositories"),
+                tooltip=["year", "project_count"],
+            )
+            .properties(
+                title="Number of projects per year based on the year of the first commit",
+                width=600,
+                height=400,
+            )
         )
+
+        st.altair_chart(chart, use_container_width=True)
+
+
+if __name__ == "__main__":
+    main()
requirements.txt CHANGED
@@ -2,3 +2,8 @@ streamlit
 pandas
 altair
 plotly
+matplotlib
+nagisa
+datasets
+wordcloud
+japanize_matplotlib
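
For reference, a minimal standalone sketch of the word-cloud pipeline this commit adds to app.py: tokenize Japanese text with nagisa (dropping particles and auxiliary verbs), remove the taishi-i/nagisa_stopwords stopwords, and render the result with wordcloud using the japanize_matplotlib font. The sample sentence and output file name below are illustrative assumptions, not part of the commit.

import japanize_matplotlib
import nagisa
from datasets import load_dataset
from wordcloud import WordCloud

# Stopword list on the Hugging Face Hub (the same dataset app.py loads).
stopwords = load_dataset("taishi-i/nagisa_stopwords")["nagisa_stopwords"]["words"]


def tokenize(text):
    # Drop particles (助詞) and auxiliary verbs (助動詞), then remove empty
    # tokens and stopwords, mirroring tokenize_description() in app.py.
    tokens = nagisa.filter(text, filter_postags=["助詞", "εŠ©ε‹•θ©ž"])
    return " ".join(w for w in tokens.words if w.strip() and w not in stopwords)


# Illustrative input; app.py joins the tokenized project descriptions instead.
text = tokenize("日本語の自然言語処理のためのオープンソースツールを検索できます")
WordCloud(
    width=800,
    height=400,
    font_path=japanize_matplotlib.get_font_ttf_path(),  # bundled Japanese font
    max_words=50,
).generate(text).to_file("wordcloud.png")  # hypothetical output path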