taishi-i committed
Commit 49c8013
1 Parent(s): 3c9e152

add application files
app.py ADDED
@@ -0,0 +1,266 @@
+ import json
+
+ import altair as alt
+ import pandas as pd
+ import streamlit as st
+
+
+ def read_json(file_name):
+     with open(file_name, "r") as f:
+         json_data = json.load(f)
+     return json_data
+
+
+ # Load a json file
+ json_file = "awesome-japanese-nlp-resources-search.json"
+ json_data = read_json(json_file)
+ df = pd.DataFrame(json_data)
+
+ # Select columns and sort by score
+ df = df[
+     [
+         "project_name",
+         "description",
+         "url",
+         "stargazers_count",
+         "downloads",
+         "source",
+         "score",
+         "first_commit",
+         "latest_commit",
+         "languages",
+         "model_or_dataset",
+     ]
+ ]
+ df = df.sort_values(by="score", ascending=False)
+
+
+ # Convert DataFrame for Dashboard
+ df["first_commit"] = pd.to_datetime(df["first_commit"], errors="coerce")
+ df["latest_commit"] = pd.to_datetime(df["latest_commit"], errors="coerce")
+ df["activity_period"] = (df["latest_commit"] - df["first_commit"]).dt.days
+ df = df[df["first_commit"] >= "2009-01-01"]
+ df = df[df["latest_commit"] >= "2009-01-01"]
+ df["str_languages"] = df["languages"].apply(
+     lambda x: ",".join(x) if isinstance(x, list) else str(x)
+ )
+ df["year"] = df["first_commit"].dt.year
+
+
+ # Set streamlit page settings
+ title = "Awesome Japanese NLP resources Dashboard"
+ icon = "🔎"
+
+ st.set_page_config(
+     page_title=title,
+     page_icon=icon,
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+ # Main streamlit page (sidebar)
+ alt.themes.enable("dark")
+ with st.sidebar:
+     st.title(f"{title} {icon}")
+     st.markdown(
+         "You can search for open-source software from [1250+ Japanese NLP repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources)."
+     )
+     query = st.text_input(label="Search keyword")
+
+     source_type = ["GitHub", "Hugging Face"]
+     selected_source_type = st.selectbox(
+         "Choose a source type: GitHub or Hugging Face", source_type
+     )
+
+     # Filtering GitHub or Hugging Face
+     df = df[df["source"] == selected_source_type]
+
+     if selected_source_type == "GitHub":
+         selected_model_or_dataset = None
+         all_languages = (
+             df["languages"]
+             .dropna()
+             .apply(lambda x: x if isinstance(x, list) else [])
+             .explode()
+             .unique()
+         )
+         all_languages = [""] + all_languages.tolist()
+         selected_languages = st.selectbox(
+             "Choose a programming language", all_languages, index=0
+         )
+
+         min_stars = int(df["stargazers_count"].min())
+         max_stars = int(df["stargazers_count"].max())
+
+         stars_range = st.slider(
+             "Choose the range for the stargazer count",
+             min_value=min_stars,
+             max_value=max_stars,
+             value=(min_stars, max_stars),
+         )
+     else:
+         selected_languages = None
+         selected_model_or_dataset = st.selectbox(
+             "Choose a model or a dataset", ["", "model", "dataset"], index=0
+         )
+
+         min_downloads = int(df["downloads"].min())
+         max_downloads = int(df["downloads"].max())
+
+         downloads_range = st.slider(
+             "Choose the range for the number of downloads",
+             min_value=min_downloads,
+             max_value=max_downloads,
+             value=(min_downloads, max_downloads),
+         )
+
+     min_activity_period = int(df["activity_period"].min())
+     max_activity_period = int(df["activity_period"].max())
+
+     activity_period_range = st.slider(
+         "Select the range for activity periods (in days)",
+         min_value=min_activity_period,
+         max_value=max_activity_period,
+         value=(min_activity_period, max_activity_period),
+     )
+     years = sorted(list(set(df["year"].dropna().astype(int).tolist())))
+
+     selected_year_range = st.slider(
+         "Select a range for the years of the first commit",
+         min_value=min(years),
+         max_value=max(years),
+         value=(min(years), max(years)),
+     )
+
+
+ df = df[
+     (df["year"] >= selected_year_range[0])
+     & (df["year"] <= selected_year_range[1])
+ ]
+
+
+ if selected_source_type == "GitHub":
+     df = df[
+         (df["stargazers_count"] >= stars_range[0])
+         & (df["stargazers_count"] <= stars_range[1])
+     ]
+ else:
+     df = df[
+         (df["downloads"] >= downloads_range[0])
+         & (df["downloads"] <= downloads_range[1])
+     ]
+
+ df = df[
+     (df["activity_period"] >= activity_period_range[0])
+     & (df["activity_period"] <= activity_period_range[1])
+ ]
+
+ contained_description = df["description"].str.contains(
+     query, case=False, na=False
+ )
+ contained_project_name = df["project_name"].str.contains(
+     query, case=False, na=False
+ )
+ df = df[contained_description | contained_project_name]
+
+ if selected_languages:
+     df = df[
+         df["str_languages"].str.contains(
+             selected_languages, case=False, na=False
+         )
+     ]
+
+ if selected_model_or_dataset:
+     df = df[
+         df["model_or_dataset"].str.contains(
+             selected_model_or_dataset, case=False, na=False
+         )
+     ]
+
+
+ # Main streamlit page (columns)
+ col1, col2 = st.columns(2, gap="large")
+
+ with col1:
+     st.markdown("### DataFrame")
+     st.markdown(f"#### Number of repositories: {len(df)}")
+     st.dataframe(df, height=600)
+
+     projects_per_year = (
+         df.groupby("year").size().reset_index(name="project_count")
+     )
+
+     chart = (
+         alt.Chart(projects_per_year)
+         .mark_bar()
+         .encode(
+             x=alt.X("year:O", title="Year"),
+             y=alt.Y("project_count:Q", title="Number of repositories"),
+             tooltip=["year", "project_count"],
+         )
+         .properties(
+             title="Number of projects per year based on the year of the first commit",
+             width=600,
+             height=400,
+         )
+     )
+
+     st.altair_chart(chart, use_container_width=True)
+
+ with col2:
+     if selected_source_type == "GitHub":
+         vs_type = "stargazers_count"
+     else:
+         vs_type = "downloads"
+
+     st.markdown(f"### First commit vs {vs_type}")
+     chart = (
+         alt.Chart(df)
+         .mark_circle(size=60)
+         .encode(
+             x="first_commit:T",
+             y=f"{vs_type}:Q",
+             tooltip=["first_commit", "project_name", f"{vs_type}"],
+         )
+         .properties(
+             title=f"Relationship between first commit date and {vs_type}",
+         )
+         .interactive()
+     )
+     st.altair_chart(chart, use_container_width=True)
+
+     st.markdown(f"### Latest commit vs {vs_type}")
+     chart = (
+         alt.Chart(df)
+         .mark_circle(size=60)
+         .encode(
+             x="latest_commit:T",
+             y=f"{vs_type}:Q",
+             tooltip=["project_name", "latest_commit", f"{vs_type}"],
+         )
+         .properties(
+             title=f"Relationship between latest commit date and {vs_type}",
+         )
+         .interactive()
+     )
+     st.altair_chart(chart, use_container_width=True)
+
+     st.markdown(f"### Activity period vs {vs_type}")
+     chart = (
+         alt.Chart(df)
+         .mark_circle(size=60)
+         .encode(
+             x=alt.X("activity_period:Q", title="Activity Period (Days)"),
+             y=alt.Y(f"{vs_type}:Q", title=f"{vs_type}"),
+             tooltip=[
+                 "project_name",
+                 "activity_period",
+                 f"{vs_type}",
+             ],
+         )
+         .properties(
+             title=f"Relationship between activity period and {vs_type}",
+         )
+         .interactive()
+     )
+     st.altair_chart(chart, use_container_width=True)
awesome-japanese-nlp-resources-search.json ADDED
The diff for this file is too large to render. See raw diff
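
Since the JSON diff itself cannot be rendered, here is a minimal, hypothetical sketch of the record shape app.py appears to expect, inferred from the columns it selects. The field names come from the code; the example values and the example_record name are invented for illustration only.

# Hypothetical example record (values invented for illustration);
# app.py loads a list of such objects and builds a pandas DataFrame from it.
example_record = {
    "project_name": "example-tokenizer",
    "description": "A Japanese morphological analyzer",
    "url": "https://github.com/example/example-tokenizer",
    "stargazers_count": 120,      # drives the GitHub star-count slider
    "downloads": 0,               # drives the Hugging Face downloads slider
    "source": "GitHub",           # "GitHub" or "Hugging Face"
    "score": 0.85,                # used only for the initial sort order
    "first_commit": "2019-04-01 10:00:00",   # parsed with pd.to_datetime
    "latest_commit": "2024-01-15 12:00:00",
    "languages": ["Python"],      # list of programming languages
    "model_or_dataset": None,     # "model" / "dataset" for Hugging Face entries
}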
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit
+ pandas
+ altair
+ plotly
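
Assuming the JSON file above sits next to app.py, the standard Streamlit workflow should be enough to run the Space locally: install the dependencies with `pip install -r requirements.txt`, then start the dashboard with `streamlit run app.py`.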