AleksanderObuchowski commited on
Commit
7bb750a
1 Parent(s): f3c17dc

get some swaaaag

Browse files
Files changed (5) hide show
  1. .gitignore +2 -0
  2. .streamlit/config.toml +9 -0
  3. Eskulap.png +0 -0
  4. app.py +143 -34
  5. requirements.txt +3 -1
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ venv
2
+ .venv
.streamlit/config.toml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [client]
2
+ showErrorDetails = false
3
+
4
+ [theme]
5
+ primaryColor="#70a99f"
6
+ backgroundColor="#212c2a"
7
+ secondaryBackgroundColor="#415854"
8
+ textColor="#70a99f"
9
+ font="monospace"
Eskulap.png ADDED
app.py CHANGED
@@ -4,21 +4,61 @@ from typing import Dict, List
4
 
5
  import plotly.express as px
6
  import streamlit as st
 
7
 
8
  from datasets import Dataset, get_dataset_infos, load_dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  BASE_DATASET: str = "lion-ai/pl_med_data"
11
  read_key = os.environ.get('HF_TOKEN', None)
12
 
13
-
14
- dataset_names_map: Dict[str, str] = {
15
- "znany_lekarz": "Porady - pytania i odpowiedzi",
16
- "kor_epikryzy_qa": "Dokumentacja medyczna - pytania i odpowiedzi",
17
- "wikipedia": "Ogólna wiedza medyczna - pytania i opowiedzi",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  }
19
 
 
 
 
20
  reverse_dataset_names_map: Dict[str, str] = {v: k for k, v in dataset_names_map.items()}
21
 
 
 
 
22
 
23
  @st.cache_resource
24
  def list_datasets() -> Dict[str, Dataset]:
@@ -34,8 +74,8 @@ def list_datasets() -> Dict[str, Dataset]:
34
  def show_examples(dataset_name: str, split: str) -> None:
35
  dataset_name = reverse_dataset_names_map.get(dataset_name, dataset_name)
36
 
37
- dataset: Dataset = load_dataset(BASE_DATASET, dataset_name, split=f"{split}[:10]", token=read_key)
38
- st.data_editor(dataset.to_pandas(), use_container_width=True)
39
 
40
 
41
  def count_all_examples(datasets: Dict[str, Dataset]) -> None:
@@ -64,32 +104,101 @@ def filter_splits(dataset: Dict[str, Dataset], split: str) -> Dict[str, Dataset]
64
  dataset_splits[dataset_name] = dataset_info.splits[split]
65
  return dataset_splits
66
 
 
 
 
67
 
68
- split: str = st.selectbox("splits", ["raw", "processed"])
69
-
70
- datasets: Dict[str, Dataset] = list_datasets()
71
- # st.write(datasets)
72
-
73
- filtered_datasets: Dict[str, Dataset] = filter_splits(datasets, split)
74
- # st.write(filtered_datasets)
75
-
76
- count_all_examples(filtered_datasets)
77
-
78
- # Create a pie chart showing the number of examples per dataset
79
- fig = px.pie(
80
- values=[split.num_examples for split in filtered_datasets.values()],
81
- names=list(filtered_datasets.keys()),
82
- # title=f"Number of Examples per Dataset ({split} split)",
83
- labels={"label": "Dataset", "value": "Number of Examples"},
84
- )
85
-
86
- # Update layout for better readability
87
- fig.update_traces(textposition="inside", textinfo="value+label")
88
- fig.update_layout(legend_title_text="Datasets", uniformtext_minsize=12, uniformtext_mode="hide")
89
-
90
- chart = st.plotly_chart(fig, use_container_width=True)
91
-
92
-
93
- dataset_name = st.selectbox("Select a dataset", list(filtered_datasets.keys()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- show_examples(dataset_name, split)
 
4
 
5
  import plotly.express as px
6
  import streamlit as st
7
+ import pandas as pd
8
 
9
  from datasets import Dataset, get_dataset_infos, load_dataset
10
+ import stanza
11
+
12
+ import matplotlib.pyplot as plt
13
+ from wordcloud import WordCloud
14
+ import io
15
+
16
+ st.set_page_config(
17
+ page_title="Eskulap Dataset",
18
+ page_icon="🩺",
19
+ layout="wide",
20
+ initial_sidebar_state="expanded",
21
+ )
22
+
23
 
24
  BASE_DATASET: str = "lion-ai/pl_med_data"
25
  read_key = os.environ.get('HF_TOKEN', None)
26
 
27
+ datasets_map = {
28
+ "znany_lekarz":
29
+ {
30
+ "display_name": "Porady",
31
+ "description": "Zbiór pytań i odpowiedzi odnośnie medycyny.",
32
+ "primary_column": "question",
33
+ },
34
+ "kor_epikryzy_qa":
35
+ {
36
+ "display_name": "Dokumentacja - QA",
37
+ "description": "Zbiór pytań i odpowiedzi do zanonimizowanej dokumentacji medycznej.",
38
+ "primary_column": "content",
39
+ },
40
+ "wikipedia":
41
+ {
42
+ "display_name": "Wikipedia",
43
+ "description": "Zbiór pytań i odpowiedzi na podstawie artykułów z Wikipedii.",
44
+ "primary_column": "question",
45
+ },
46
+ "ulotki_medyczne":
47
+ {
48
+ "display_name": "Pytania farmaceutyczne",
49
+ "description": "Zbiór pytań i odpowiedzi na podstawie ulotek medycznych.",
50
+ "primary_column": "question",
51
+ },
52
  }
53
 
54
+
55
+ dataset_names_map: Dict[str, str] = {k: v["display_name"] for k, v in datasets_map.items()}
56
+
57
  reverse_dataset_names_map: Dict[str, str] = {v: k for k, v in dataset_names_map.items()}
58
 
59
+ @st.cache_resource
60
+ def load_stanza_pipeline():
61
+ return stanza.Pipeline(lang='pl', processors='tokenize,mwt,pos,lemma')
62
 
63
  @st.cache_resource
64
  def list_datasets() -> Dict[str, Dataset]:
 
74
  def show_examples(dataset_name: str, split: str) -> None:
75
  dataset_name = reverse_dataset_names_map.get(dataset_name, dataset_name)
76
 
77
+ dataset: Dataset = load_dataset(BASE_DATASET, dataset_name, split=f"{split}[:50]", token=read_key)
78
+ st.data_editor(dataset.to_pandas(), use_container_width=True, height=900)
79
 
80
 
81
  def count_all_examples(datasets: Dict[str, Dataset]) -> None:
 
104
  dataset_splits[dataset_name] = dataset_info.splits[split]
105
  return dataset_splits
106
 
107
+ @st.cache_data(show_spinner=False)
108
+ def generate_wordcloud(dataset_name, split):
109
+ dataset_name = reverse_dataset_names_map.get(dataset_name, dataset_name)
110
 
111
+ dataset: Dataset = load_dataset(BASE_DATASET, dataset_name, split=f"{split}[:500]", token=read_key)
112
+
113
+ primary_column = datasets_map[dataset_name]["primary_column"]
114
+
115
+ text = ""
116
+ progress_bar = st.progress(0, text = "Generating wordcloud...")
117
+ for i, example in enumerate(dataset[primary_column]):
118
+ doc = stanza_pipeline(example)
119
+ nouns = [word.lemma for sent in doc.sentences for word in sent.words if word.upos == 'NOUN']
120
+ text += " ".join(nouns) + " "
121
+ progress_bar.progress((i + 1) / len(dataset[primary_column]), text = f"Generating wordcloud...")
122
+
123
+ wordcloud = WordCloud(width=600, height=600, background_color='#212c2a', colormap="Greens", contour_width=0, contour_color="#212c2a").generate(text)
124
+ progress_bar.empty()
125
+
126
+ plt.figure(figsize=(6, 6), facecolor='#212c2a')
127
+ plt.imshow(wordcloud, interpolation='bilinear')
128
+ plt.axis('off')
129
+ plt.tight_layout(pad=0)
130
+
131
+ # Save the plot to a bytes buffer
132
+ buf = io.BytesIO()
133
+ plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0, facecolor='#212c2a')
134
+ buf.seek(0)
135
+
136
+ # Display the image in Streamlit
137
+ st.image(buf, use_column_width=True)
138
+
139
+
140
+ _, col, _ = st.columns([1, 2, 1])
141
+
142
+ with col:
143
+ split: str = "processed"
144
+
145
+ datasets: Dict[str, Dataset] = list_datasets()
146
+ stanza_pipeline = load_stanza_pipeline()
147
+ # st.write(datasets)
148
+
149
+ filtered_datasets: Dict[str, Dataset] = filter_splits(datasets, split)
150
+ # st.write(filtered_datasets)
151
+ image = st.image("Eskulap.png", use_column_width=True)
152
+
153
+ count_all_examples(filtered_datasets)
154
+
155
+ distribution = {
156
+ "dataset": list(filtered_datasets.keys()),
157
+ "count": [split.num_examples for split in filtered_datasets.values()],
158
+ }
159
+
160
+ distribution_df = pd.DataFrame(distribution)
161
+
162
+ # Create a pie chart showing the number of examples per dataset
163
+ fig = px.pie(
164
+ distribution_df,
165
+ names="dataset",
166
+ values="count",
167
+ hover_name="dataset",
168
+ title=f"Data distribution",
169
+ labels={"label": "Dataset", "value": "Number of Examples"},
170
+ color_discrete_sequence=px.colors.sequential.Blugrn,
171
+ hole=0.3,
172
+ )
173
+
174
+ # Update layout for better readability
175
+ # fig.update_traces(textposition="inside", textinfo="value+label")
176
+ fig.update_traces(textposition='none')
177
+ fig.update_layout(legend_title_text="Datasets", uniformtext_minsize=12, uniformtext_mode="hide")
178
+
179
+ chart = st.plotly_chart(fig, use_container_width=True)
180
+
181
+
182
+ dataset_name = st.selectbox("Select a dataset", list(filtered_datasets.keys()))
183
+ st.write(f"### {dataset_name}")
184
+ st.write(datasets_map[reverse_dataset_names_map.get(dataset_name)]["description"])
185
+ st.markdown("***")
186
+ col1, col2 = st.columns(2)
187
+ with col1:
188
+ st.write(f"### Sample data")
189
+ show_examples(dataset_name, split)
190
+
191
+
192
+ with col2:
193
+ st.write(f"### Wordcloud")
194
+ generate_wordcloud(dataset_name, split)
195
+
196
+ _, col, _ = st.columns([1, 2, 1])
197
+
198
+
199
+ with col:
200
+ st.button("Made with ❤️ by thelion.ai", use_container_width=True, disabled=True)
201
+ st.write("Interested in the project? Contact us: [email protected]")
202
+
203
+
204
 
 
requirements.txt CHANGED
@@ -1 +1,3 @@
1
- plotly==5.23.0
 
 
 
1
+ plotly==5.23.0
2
+ wordcloud==1.9.3
3
+ stanza==1.8.2