prashant committed on
Commit • 72e4dad
Parent(s): 49a314a
ver0.2 appstore update

Files changed:
- appStore/info.py +8 -1
- appStore/keyword_search.py +114 -490
- appStore/multiapp.py +33 -8
- appStore/sdg_analysis.py +113 -230
- sample/keywordexample.json +7 -0
appStore/info.py
CHANGED
@@ -2,6 +2,13 @@ import streamlit as st
 
 
 def app():
+    # if 'file' in st.session_state:
+    #     file = st.session_state['file']
+    # else:
+    #     st.sidebar.markdown(" :cloud: Upload document ")
+    #     uploaded_file = st.sidebar.file_uploader('', type=['pdf', 'docx', 'txt']) #Upload PDF File
+    #     st.session_state['file'] = uploaded_file
+
     with open('style.css') as f:
         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
     footer = """
@@ -33,7 +40,7 @@ The collaboration aims to determine the potential of NLP methods for tracking po
     """
     st.markdown(intro, unsafe_allow_html=True)
     st.image("appStore/img/pic1.png", caption="NDC Coherence")
-    st.subheader("Methodology")
+    #st.subheader("Methodology")
     #st.write("Each sentence in the generated answer ends with a coloured tooltip; the colour ranges from red to green. "
     #         "The tooltip contains a value representing answer sentence similarity to a specific sentence in the "
     #         "Wikipedia context passages retrieved. Mouseover on the tooltip will show the sentence from the "
appStore/keyword_search.py
CHANGED
@@ -1,10 +1,12 @@
 # set path
-import glob, os, sys
+import glob, os, sys
+from udfPreprocess.search import semantic_search
+sys.path.append('../udfPreprocess')
 
 #import helper
 import udfPreprocess.docPreprocessing as pre
 import udfPreprocess.cleaning as clean
-
+from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
 #import needed libraries
 import seaborn as sns
 from pandas import DataFrame
@@ -24,20 +26,24 @@ import docx
 from docx.shared import Inches
 from docx.shared import Pt
 from docx.enum.style import WD_STYLE_TYPE
-
+import logging
+logger = logging.getLogger(__name__)
 import tempfile
 import sqlite3
+import json
+import configparser
+
 
 def app():
 
     with st.container():
         st.markdown("<h1 style='text-align: center; \
-                      color: black;'>
+                      color: black;'> Search</h1>",
                     unsafe_allow_html=True)
         st.write(' ')
         st.write(' ')
 
-        with st.expander("ℹ️ - About this app", expanded=
+    with st.expander("ℹ️ - About this app", expanded=False):
 
         st.write(
             """
@@ -45,498 +51,116 @@ def app():
             built in Streamlit for doing keyword search in \
             policy document - developed by GIZ Data and the \
             Sustainable Development Solution Network.
-            """
-            )
+            """)
 
     st.markdown("")
-
-    st.markdown("### 📌 Step One: Upload document ### ")
-
-    with st.container():
-        def bm25_tokenizer(text):
-            tokenized_doc = []
-            for token in text.lower().split():
-                token = token.strip(string.punctuation)
-
-                if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
-                    tokenized_doc.append(token)
-            return tokenized_doc
-
-        def bm25TokenizeDoc(paraList):
-            tokenized_corpus = []
-            for passage in tqdm(paraList):
-                if len(passage.split()) >256:
-                    temp = " ".join(passage.split()[:256])
-                    tokenized_corpus.append(bm25_tokenizer(temp))
-                    temp = " ".join(passage.split()[256:])
-                    tokenized_corpus.append(bm25_tokenizer(temp))
-                else:
-                    tokenized_corpus.append(bm25_tokenizer(passage))
-
-            return tokenized_corpus
-        def search(keyword):
-            ##### BM25 search (lexical search) #####
-            bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
-            top_n = np.argpartition(bm25_scores, -10)[-10:]
-            bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
-            bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
-
-            ##### Sematic Search #####
-            # Encode the query using the bi-encoder and find potentially relevant passages
-            #query = "Does document contain {} issues ?".format(keyword)
-            question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
-
-            hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
-            hits = hits[0]  # Get the hits for the first query
-
-            ##### Re-Ranking #####
-            # Now, score all retrieved passages with the cross_encoder
-            #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
-            #cross_scores = cross_encoder.predict(cross_inp)
-
-            # Sort results by the cross-encoder scores
-            #for idx in range(len(cross_scores)):
-            #    hits[idx]['cross-score'] = cross_scores[idx]
 
-        def show_results(keywordList):
-            document = docx.Document()
-            document.add_heading('Document name:{}'.format(file_name), 2)
-            section = document.sections[0]
-
-            # Calling the footer
-            footer = section.footer
-
-            # Calling the paragraph already present in
-            # the footer section
-            footer_para = footer.paragraphs[0]
 
-
-            para.font.size = Pt(12)
-            bm25_hits, hits = search(keyword)
-
-            st.markdown("""
-                        We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-                        """)
-            # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-            st.markdown("Top few lexical search (BM25) hits")
-            document.add_paragraph("Top few lexical search (BM25) hits")
-
-            for hit in bm25_hits[0:5]:
-                if hit['score'] > 0.00:
-                    st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-                    document.add_paragraph("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-            #  st.table(bm25_hits[0:3])
-
-            st.markdown("\n-------------------------\n")
-            st.markdown("Top few Bi-Encoder Retrieval hits")
-            document.add_paragraph("\n-------------------------\n")
-            document.add_paragraph("Top few Bi-Encoder Retrieval hits")
-
-            hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-            for hit in hits[0:5]:
-                #  if hit['score'] > 0.45:
-                st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-                document.add_paragraph("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-            #st.table(hits[0:3]
-            document.save('demo.docx')
-            with open("demo.docx", "rb") as file:
-                btn = st.download_button(
-                    label="Download file",
-                    data=file,
-                    file_name="demo.docx",
-                    mime="txt/docx"
-                )
-
-        @st.cache(allow_output_mutation=True)
-        def load_sentenceTransformer(name):
-            return SentenceTransformer(name)
-
-        docs = None
-        # asking user for either upload or select existing doc
-        choice = st.radio(label = 'Select the Document',
-                          help = 'You can upload the document \
-                          or else you can try a example document',
-                          options = ('Upload Document', 'Try Example'),
-                          horizontal = True)
-
-        if choice == 'Upload Document':
-            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
-            if uploaded_file is not None:
-                with tempfile.NamedTemporaryFile(mode="wb") as temp:
-                    bytes_data = uploaded_file.getvalue()
-                    temp.write(bytes_data)
-
-                    st.write("Uploaded Filename: ", uploaded_file.name)
-                    file_name = uploaded_file.name
-                    file_path = temp.name
-                    docs = pre.load_document(file_path, file_name)
-                    haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-
-        else:
-            # listing the options
-            option = st.selectbox('Select the example document',
-                                  ('South Africa:Low Emission strategy',
-                                   'Ethiopia: 10 Year Development Plan'))
-            if option is 'South Africa:Low Emission strategy':
-                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
-                st.write("Selected document:", file_name.split('/')[1])
-                # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
-                # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
-            else:
-                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
-                st.write("Selected document:", file_name.split('/')[1])
-
-            docs = pre.load_document(file_path,file_name)
-            haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-
-        if docs is not None:
-
-            bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5')    # multi-qa-MiniLM-L6-cos-v1
-            bi_encoder.max_seq_length = 64     #Truncate long passages to 256 tokens
-            top_k = 32
-
-            document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
-            tokenized_corpus = bm25TokenizeDoc(paraList)
-            document_bm25 = BM25Okapi(tokenized_corpus)
-            keywordList = None
-
-            col1, col2 = st.columns(2)
-            with col1:
-                if st.button('Climate Change Keyword Search'):
-                    keywordList = ['extreme weather', 'floods', 'droughts']
-
-                    # show_results(keywordList)
-            with col2:
-                if st.button('Gender Keywords Search'):
-                    keywordList = ['Gender', 'Women empowernment']
-
-                    # show_results(keywordList)
-
-            keyword = st.text_input("Please enter here \
-                                    what you want to search, \
-                                    we will look for similar context \
-                                    in the document.",
-                                    value="",)
-            if st.button("Find them."):
-                keywordList = [keyword]
-            if keywordList is not None:
 
-                show_results(keywordList)
-
-            # @st.cache(allow_output_mutation=True)
-            # def load_sentenceTransformer(name):
-            #     return SentenceTransformer(name)
-
-            # bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5')    # multi-qa-MiniLM-L6-cos-v1
-            # bi_encoder.max_seq_length = 64     #Truncate long passages to 256 tokens
-            # top_k = 32
-
-            # #@st.cache(allow_output_mutation=True)
-            # #def load_crossEncoder(name):
-            # #    return CrossEncoder(name)
-
 
-# return tokenized_corpus
-
-# tokenized_corpus = bm25TokenizeDoc(paraList)
-
-# document_bm25 = BM25Okapi(tokenized_corpus)
-
-# # def search(keyword):
-# #     ##### BM25 search (lexical search) #####
-# #     bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
-#     top_n = np.argpartition(bm25_scores, -10)[-10:]
-#     bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
-#     bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
-
-#     ##### Sematic Search #####
-#     # Encode the query using the bi-encoder and find potentially relevant passages
-#     #query = "Does document contain {} issues ?".format(keyword)
-#     question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
-
-#     hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
-#     hits = hits[0]  # Get the hits for the first query
-
-#     ##### Re-Ranking #####
-#     # Now, score all retrieved passages with the cross_encoder
-#     #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
-#     #cross_scores = cross_encoder.predict(cross_inp)
-
-#     # Sort results by the cross-encoder scores
-#     #for idx in range(len(cross_scores)):
-#     #    hits[idx]['cross-score'] = cross_scores[idx]
-
-#     return bm25_hits, hits
-
-# def show_results(keywordList):
-#     for keyword in keywordList:
-#         bm25_hits, hits = search(keyword)
-
-#         st.markdown("""
-#                     We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-#                     """)
-#         # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-#         st.markdown("Top few lexical search (BM25) hits")
-#         for hit in bm25_hits[0:5]:
-#             if hit['score'] > 0.00:
-#                 st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-#         #  st.table(bm25_hits[0:3])
-
-#         st.markdown("\n-------------------------\n")
-#         st.markdown("Top few Bi-Encoder Retrieval hits")
-
-#         hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-#         for hit in hits[0:5]:
-#             #  if hit['score'] > 0.45:
-#             st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-#         #st.table(hits[0:3]
-
-# #   if docs is not None:
-# #       col1, col2 = st.columns(2)
-# #       with col1:
-# #           if st.button('Gender Keywords Search'):
-# #               keywordList = ['Gender Equality', 'Women empowernment']
-# #               show_results(keywordList)
-# #       with col2:
-# #           if st.button('Climate Change Keyword Search'):
-# #               keywordList = ['extreme weather', 'floods', 'droughts']
-# #               show_results(keywordList)
-
-# #       keyword = st.text_input("Please enter here \
-# #                               what you want to search, \
-# #                               we will look for similar context \
-# #                               in the document.",
-# #                               value="",)
-# #       if st.button("Find them."):
-# #           show_results([keyword])
-
-# choice1 = st.radio(label = 'Keyword Search',
-#                    help = 'Search \
-#                    or else you can try a example document',
-#                    options = ('Enter your own Query', 'Try Example'),
-#                    horizontal = True)
-
-# if choice1 == 'Enter your own Query':
-#     keyword = st.text_input("Please enter here \
-#                             what you want to search, \
-#                             we will look for similar context \
-#                             in the document.",
-#                             value="",)
-# else:
-#     option1 = st.selectbox('Select the Predefined word cluster',
-#                            ('Gender:[Gender Equality, Women empowernment]',
-#                             'Climate change:[extreme weather, floods, droughts]',
-#                             ))
-#     if option1 == 'Gender:[Gender Equality, Women empowernment]':
-#         keywordList = ['Gender Equality', 'Women empowernment']
-#     else:
-#         keywordList = ['extreme weather', 'floods', 'droughts']
-
-# option1 = st.selectbox('Select the Predefined word cluster',
-#                        ('Gender:[Gender Equality, Women empowernment]',
-#                         'Climate change:[extreme weather, floods, droughts]',
-# #                        'Enter your Own Keyword Query'))
-# if option1 == 'Enter your Own Keyword Query':
-#     keyword = st.text_input("Please enter here \
-#                             what you want to search, \
-#                             we will look for similar context \
-#                             in the document.",
-#                             value="",)
-# elif option1 == 'Gender:[Gender Equality, Women empowernment]':
-#     keywordList = ['Gender Equality', 'Women empowernment']
-# elif option1 == 'Climate change:[extreme weather, floods, droughts]':
-#     keywordList = ['extreme weather', 'floods', 'droughts']
-
-# st.markdown("### 📌 Step Two: Search Keyword in Document ### ")
-
-# @st.cache(allow_output_mutation=True)
-# def load_sentenceTransformer(name):
-#     return SentenceTransformer(name)
-
-# bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5')    # multi-qa-MiniLM-L6-cos-v1
-# bi_encoder.max_seq_length = 64     #Truncate long passages to 256 tokens
-# top_k = 32
-
-# #@st.cache(allow_output_mutation=True)
-# #def load_crossEncoder(name):
-# #    return CrossEncoder(name)
-
-# # cross_encoder = load_crossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
-# document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
-
-# def bm25_tokenizer(text):
-#     tokenized_doc = []
-#     for token in text.lower().split():
-#         token = token.strip(string.punctuation)
-
-#         if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
-#             tokenized_doc.append(token)
-#     return tokenized_doc
-
-# def bm25TokenizeDoc(paraList):
-#     tokenized_corpus = []
-#     for passage in tqdm(paraList):
-#         if len(passage.split()) >256:
-#             temp = " ".join(passage.split()[:256])
-#             tokenized_corpus.append(bm25_tokenizer(temp))
-#             temp = " ".join(passage.split()[256:])
-#             tokenized_corpus.append(bm25_tokenizer(temp))
-#         else:
-#             tokenized_corpus.append(bm25_tokenizer(passage))
-
-#     return tokenized_corpus
-
-# tokenized_corpus = bm25TokenizeDoc(paraList)
-
-# document_bm25 = BM25Okapi(tokenized_corpus)
-
-# def search(keyword):
-#     ##### BM25 search (lexical search) #####
-#     bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
-#     top_n = np.argpartition(bm25_scores, -10)[-10:]
-#     bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
-#     bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
-
-#     # Encode the query using the bi-encoder and find potentially relevant passages
-#     #query = "Does document contain {} issues ?".format(keyword)
-#     question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
-
-#     hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
-#     hits = hits[0]  # Get the hits for the first query
-
-#     ##### Re-Ranking #####
-#     # Now, score all retrieved passages with the cross_encoder
-#     #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
-#     #cross_scores = cross_encoder.predict(cross_inp)
-
-#     # Sort results by the cross-encoder scores
-#     #for idx in range(len(cross_scores)):
-#     #    hits[idx]['cross-score'] = cross_scores[idx]
-
-#     return bm25_hits, hits
-
-# def show_results(keywordList):
-#     for keyword in keywordList:
-#         bm25_hits, hits = search(keyword)
-
-#         st.markdown("""
-#                     We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-#                     """)
-#         # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-#         st.markdown("Top few lexical search (BM25) hits")
-#         for hit in bm25_hits[0:5]:
-#             if hit['score'] > 0.00:
-#                 st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-#         #  st.table(bm25_hits[0:3])
-
-#         st.markdown("\n-------------------------\n")
-#         st.markdown("Top few Bi-Encoder Retrieval hits")
-
-#         hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-#         for hit in hits[0:5]:
-#             #  if hit['score'] > 0.45:
-#             st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-#         #st.table(hits[0:3]
-
-# #     if st.button("Find them."):
-# #         bm25_hits, hits = search(keyword)
-
-# #         st.markdown("""
-# #                     We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-# #                     """)
-# #         # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-# #         st.markdown("Top few lexical search (BM25) hits")
-# #         for hit in bm25_hits[0:5]:
-# #             if hit['score'] > 0.00:
-# #                 st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-# # #         st.table(bm25_hits[0:3])
-
-# #         st.markdown("\n-------------------------\n")
-# #         st.markdown("Top few Bi-Encoder Retrieval hits")
-
-# #         hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-# #         for hit in hits[0:5]:
-# #             #  if hit['score'] > 0.45:
-# #             st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-# #             #st.table(hits[0:3]
-
+
+
 
+    with st.sidebar:
+        with open('sample/keywordexample.json','r') as json_file:
+            keywordexample = json.load(json_file)
 
+        genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
+        if genre == 'Food':
+            keywordList = keywordexample['Food']
+        elif genre == 'Climate':
+            keywordList = keywordexample['Climate']
+        elif genre == 'Social':
+            keywordList = keywordexample['Social']
+        elif genre == 'Nature':
+            keywordList = keywordexample['Nature']
+        elif genre == 'Implementation':
+            keywordList = keywordexample['Implementation']
         else:
+            keywordList = None
 
+        searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
 
+    with st.container():
+        if keywordList is not None:
+            queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
+                                      value="{}".format(keywordList))
+        else:
+            queryList = st.text_input("Please enter here your question and we will look \
+                                      for an answer in the document OR enter the keyword you \
+                                      are looking for and we will \
+                                      we will look for similar context \
+                                      in the document.",
+                                      placeholder="Enter keyword here")
+
+        if st.button("Find them"):
+
+            if queryList == "":
+                st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
+                logging.warning("Terminated as no keyword provided")
+            else:
+
+                if 'docs' in st.session_state:
+                    docs = st.session_state['docs']
+                    paraList = st.session_state['paraList']
+
+                    if searchtype == 'Exact Matches':
+                        queryList = list(queryList.split(","))
+                        logging.info("performing lexical search")
+                        tokenized_corpus = bm25TokenizeDoc(paraList)
+                        # st.write(len(tokenized_corpus))
+                        document_bm25 = BM25Okapi(tokenized_corpus)
+
+                        with st.spinner("Performing Exact matching search (Lexical search) for you"):
+                            st.markdown("##### Top few lexical search (BM25) hits #####")
+
+                            for keyword in queryList:
+
+                                bm25_hits = lexical_search(keyword,document_bm25)
+
+                                counter = 0
+                                for hit in bm25_hits:
+                                    if hit['score'] > 0.00:
+                                        counter += 1
+                                        if counter == 1:
+                                            st.markdown("###### Results for keyword: **{}** ######".format(keyword))
+                                        # st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                                        st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
+
+                                st.markdown("---")
+                                if counter == 0:
+                                    st.write("No results found for '**{}**' ".format(keyword))
+                    else:
+                        logging.info("starting semantic search")
+                        with st.spinner("Performing Similar/Contextual search"):
+                            query = "Find {} related issues ?".format(queryList)
+                            config = configparser.ConfigParser()
+                            config.read_file(open('udfPreprocess/paramconfig.cfg'))
+                            threshold = float(config.get('semantic_search','THRESHOLD'))
+                            st.write(query)
+                            semantic_hits = semantic_search(query,paraList)
+                            st.markdown("##### Semantic search hits for {} related topics #####".format(queryList))
+
+                            for i,queryhit in enumerate(semantic_hits):
+
+                                # st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
+                                counter = 0
+                                for hit in queryhit:
+                                    counter += 1
+
+                                    if hit['score'] > threshold:
+                                        # st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                                        st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
+
+                                        # document.add_paragraph("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                                st.markdown("---")
+                            # st.write(semantic_hits)
+
+                else:
+                    st.info("🤔 No document found, please try to upload it at the sidebar!")
+                    logging.warning("Terminated as no keyword provided")
 
+
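
Note: the new `udfPreprocess/search.py` module that this page now imports (`bm25_tokenizer`, `bm25TokenizeDoc`, `lexical_search`, `semantic_search`) is not part of this commit. A minimal sketch of what it might contain, assembled from the inline BM25 and bi-encoder code removed from keyword_search.py above, could look like the following; names and model choices are assumptions carried over from the deleted code, not the actual module.

    # hypothetical udfPreprocess/search.py, mirroring the removed inline code
    import string
    import numpy as np
    from sentence_transformers import SentenceTransformer, util
    from sklearn.feature_extraction.text import _stop_words
    from tqdm import tqdm

    def bm25_tokenizer(text):
        # lowercase, strip punctuation, drop English stop words
        tokenized_doc = []
        for token in text.lower().split():
            token = token.strip(string.punctuation)
            if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
                tokenized_doc.append(token)
        return tokenized_doc

    def bm25TokenizeDoc(paraList):
        # split passages longer than 256 words in two before tokenizing
        tokenized_corpus = []
        for passage in tqdm(paraList):
            words = passage.split()
            if len(words) > 256:
                tokenized_corpus.append(bm25_tokenizer(" ".join(words[:256])))
                tokenized_corpus.append(bm25_tokenizer(" ".join(words[256:])))
            else:
                tokenized_corpus.append(bm25_tokenizer(passage))
        return tokenized_corpus

    def lexical_search(keyword, document_bm25):
        # BM25 scores for every passage; top 10 hits returned best-first
        bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
        top_n = np.argpartition(bm25_scores, -10)[-10:]
        bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
        return sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    def semantic_search(query, paraList, top_k=32):
        # bi-encoder retrieval; returns one list of hits per query,
        # which matches how the app iterates over semantic_hits
        bi_encoder = SentenceTransformer('msmarco-distilbert-cos-v5')
        bi_encoder.max_seq_length = 64
        document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
        question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
        return util.semantic_search(question_embedding, document_embeddings, top_k=top_k)

The score threshold applied to the semantic hits is read from `udfPreprocess/paramconfig.cfg` (`semantic_search.THRESHOLD`) in the app code above, so the helper itself does not need to filter.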
appStore/multiapp.py
CHANGED
@@ -2,6 +2,8 @@
 """
 import streamlit as st
 from PIL import Image
+from streamlit_option_menu import option_menu
+from udfPreprocess.uploadAndExample import add_upload
 
 class MultiApp:
     """Framework for combining multiple streamlit applications.
@@ -25,7 +27,7 @@ class MultiApp:
     def __init__(self):
         self.apps = []
 
-    def add_app(self,
+    def add_app(self,title,icon, func):
         """Adds a new application.
         Parameters
         ----------
@@ -36,16 +38,39 @@ class MultiApp:
         """
         self.apps.append({
             "title": title,
+            "icon": icon,
             "function": func
         })
 
     def run(self):
+
         st.sidebar.write(format_func=lambda app: app['title'])
-        image = Image.open('appStore/img/
+        image = Image.open('appStore/img/giz_sdsn.jpg')
         st.sidebar.image(image)
-
-
-
-
-
-
+        #st.sidebar.markdown("## 📌 Pages ")
+        #app = st.sidebar.radio(
+        #    'Pages',
+        #    self.apps,
+        # from streamlit_option_menu import option_menu
+        with st.sidebar:
+            selected = option_menu(None, [page["title"] for page in self.apps],
+                                   icons=[page["icon"] for page in self.apps],
+                                   menu_icon="cast", default_index=0)
+
+        for index, item in enumerate(self.apps):
+            if item["title"] == selected:
+                self.apps[index]["function"]()
+                break
+
+        # app['function']()
+        choice = st.sidebar.radio(label = 'Select the Document',
+                                  help = 'You can upload the document \
+                                  or else you can try a example document',
+                                  options = ('Upload Document', 'Try Example'),
+                                  horizontal = True)
+        add_upload(choice)
+        # st.sidebar.markdown('')
+        # st.sidebar.markdown(" :cloud: Upload document ")
+        # uploaded_file = st.sidebar.file_uploader('', type=['pdf', 'docx', 'txt']) #Upload PDF File
+        # st.session_state['file'] = uploaded_file
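
The `udfPreprocess.uploadAndExample.add_upload` helper called from `run()` is also outside this commit. Based on the upload/example-document logic removed from keyword_search.py and the `st.session_state['docs']` / `st.session_state['paraList']` keys the pages now read, a plausible sketch (names and file choices are assumptions) might be:

    # hypothetical udfPreprocess/uploadAndExample.py
    import tempfile
    import streamlit as st
    import udfPreprocess.docPreprocessing as pre
    import udfPreprocess.cleaning as clean

    def add_upload(choice):
        if choice == 'Upload Document':
            uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
            if uploaded_file is None:
                return
            with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
                temp.write(uploaded_file.getvalue())
                file_name, file_path = uploaded_file.name, temp.name
        else:
            # one of the example documents shipped in sample/
            file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
            st.sidebar.write("Selected document:", file_name.split('/')[1])

        # preprocess once and share the result with all pages via session state
        docs = pre.load_document(file_path, file_name)
        haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
        st.session_state['docs'] = docs
        st.session_state['paraList'] = paraList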
appStore/sdg_analysis.py
CHANGED
@@ -1,5 +1,6 @@
 # set path
-import glob, os, sys;
+import glob, os, sys;
+sys.path.append('../udfPreprocess')
 
 #import helper
 import udfPreprocess.docPreprocessing as pre
@@ -17,10 +18,26 @@ import pandas as pd
 import docx
 from docx.shared import Inches
 from docx.shared import Pt
-from docx.enum.style import WD_STYLE_TYPE
+from docx.enum.style import WD_STYLE_TYPE
+from udfPreprocess.sdg import sdg_classification
 
 import tempfile
 import sqlite3
+import logging
+logger = logging.getLogger(__name__)
+
+
+@st.cache(allow_output_mutation=True)
+def load_keyBert():
+    return KeyBERT()
+
+@st.cache(allow_output_mutation=True)
+def load_sdgClassifier():
+    classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
+    return classifier
+
 
 def app():
 
@@ -29,154 +46,38 @@ def app():
     st.write(' ')
     st.write(' ')
 
-    with st.expander("ℹ️ - About this app", expanded=
+    with st.expander("ℹ️ - About this app", expanded=False):
 
         st.write(
             """
-            The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solution Network. \n
-
-            2. SDG Classification for the paragraphs/texts in the document
-            """
-            )
-
+            The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents with respect to SDG Classification for the paragraphs/texts in the document - developed by GIZ Data and the Sustainable Development Solution Network. \n
+            """)
     st.markdown("")
 
-    st.markdown("")
-    st.markdown("## 📌 Step One: Upload document ")
-
-    with st.container():
-
-
-        # asking user for either upload or select existing doc
-        choice = st.radio(label = 'Select the Document',
-                          help = 'You can upload the document \
-                          or else you can try a example document',
-                          options = ('Upload Document', 'Try Example'),
-                          horizontal = True)
-
-        if choice == 'Upload Document':
-            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
-            if uploaded_file is not None:
-                with tempfile.NamedTemporaryFile(mode="wb") as temp:
-                    bytes_data = uploaded_file.getvalue()
-                    temp.write(bytes_data)
-
-                    st.write("Uploaded Filename: ", uploaded_file.name)
-                    file_name = uploaded_file.name
-                    file_path = temp.name
-                    docs = pre.load_document(file_path, file_name)
-                    docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
-                    #haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-
-        else:
-            # listing the options
-            option = st.selectbox('Select the example document',
-                                  ('Ethiopia: 10 Year Development Plan',
-                                   'South Africa:Low Emission strategy'))
-            if option is 'South Africa:Low Emission strategy':
-                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
-                st.write("Selected document:", file_name.split('/')[1])
-                # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
-                # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
-            else:
-                # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
-                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
-                st.write("Selected document:", file_name.split('/')[1])
-
-            if option is not None:
-                docs = pre.load_document(file_path,file_name)
-                # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-                docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
-
 
-        if docs is not None:
 
-
-            keywords = kw_model.extract_keywords(
-            all_text,
-            keyphrase_ngram_range=(1, 3),
-            use_mmr=True,
-            stop_words="english",
-            top_n=10,
-            diversity=0.7,
-            )
-
-            df = (
-                DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
-                .sort_values(by="Relevancy", ascending=False)
-                .reset_index(drop=True)
-            )
-            df1 = (
-                DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
-                .sort_values(by="Relevancy", ascending=False)
-                .reset_index(drop=True)
-            )
-            df.index += 1
-
-            #
-
-            )
-            c1, c2, c3 = st.columns([1, 3, 1])
-
-            format_dictionary = {
-                "Relevancy": "{:.1%}",
-            }
-
-            df = df.format(format_dictionary)
-
-            with c2:
-                st.table(df)
-
-            ######## SDG classiciation
-            # @st.cache(allow_output_mutation=True)
-            # def load_sdgClassifier():
-            #     classifier = pipeline("text-classification", model= "../models/osdg_sdg/")
-
-            #     return classifier
-
-            # load from disc (github repo) for performance boost
-            @st.cache(allow_output_mutation=True)
-            def load_sdgClassifier():
-                classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
-
-                return classifier
-
-            classifier = load_sdgClassifier()
-
-            # # not needed, par list comes from pre_processing function already
-
-            # word_list = all_text.split()
-            # len_word_list = len(word_list)
-            # par_list = []
-            # par_len = 130
-            # for i in range(0,len_word_list // par_len):
-            #     string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
-            #     par_list.append(string_part)
-
-            labels = classifier(par_list)
-            labels_= [(l['label'],l['score']) for l in labels]
-            df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
-            df2['text'] = par_list
-            df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
-            df2.index += 1
-            df2 =df2[df2['Relevancy']>.85]
-            x = df2['SDG'].value_counts()
-            df3 = df2.copy()
+    with st.container():
+
+        if 'docs' in st.session_state:
+            docs = st.session_state['docs']
+            docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
+            with st.spinner("Running SDG"):
 
+                df, x = sdg_classification(par_list)
+
+            # classifier = load_sdgClassifier()
+
+            # labels = classifier(par_list)
+            # labels_= [(l['label'],l['score']) for l in labels]
+            # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
+            # df2['text'] = par_list
+            # df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+            # df2.index += 1
+            # df2 =df2[df2['Relevancy']>.85]
+            # x = df2['SDG'].value_counts()
+            # df3 = df2.copy()
 
             plt.rcParams['font.size'] = 25
             colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
@@ -184,110 +85,92 @@ def app():
             fig, ax = plt.subplots()
             ax.pie(x, colors=colors, radius=2, center=(4, 4),
                    wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False,labels =list(x.index))
-            fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
-            st.markdown("
+            # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
+            st.markdown("#### Anything related to SDGs? ####")
+
+            # st.markdown("#### 🎈 Anything related to SDGs? ####")
 
             c4, c5, c6 = st.columns([2, 2, 2])
 
             # Add styling
             cmGreen = sns.light_palette("green", as_cmap=True)
             cmRed = sns.light_palette("red", as_cmap=True)
-            df2 = df2.style.background_gradient(
-
-
-
-
-            )
+            # df2 = df2.style.background_gradient(
+            #     cmap=cmGreen,
+            #     subset=[
+            #         "Relevancy",
+            #     ],
+            # )
 
-            format_dictionary = {
-
-            }
+            # format_dictionary = {
+            #     "Relevancy": "{:.1%}",
+            # }
 
-            df2 = df2.format(format_dictionary)
+            # df2 = df2.format(format_dictionary)
 
             with c5:
                 st.pyplot(fig)
 
             c7, c8, c9 = st.columns([1, 10, 1])
             with c8:
-                st.table(
-
-                document = docx.Document()
-                document.add_heading('Document name:{}'.format(file_name), 2)
-                # Choosing the top most section of the page
-                section = document.sections[0]
-
-                # Calling the footer
-                footer = section.footer
-
-                # Calling the paragraph already present in
-                # the footer section
-                footer_para = footer.paragraphs[0]
-
-                font_styles = document.styles
-                font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
-                font_object = font_charstyle.font
-                font_object.size = Pt(7)
-                # Adding the centered zoned footer
-                footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
-
-                #footer_para.text = "\tPowered by GIZ Data and the Sustainable Development Solution Network\
-                #                   hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev"
-                #footer_para.font.size = docx.shared.Pt(6)
-
-                document.add_heading('What is the document about', level=1)
-                t = document.add_table(df1.shape[0]+1, df1.shape[1])
-
-                # add the header rows.
-                for j in range(df1.shape[-1]):
-                    t.cell(0,j).text = df1.columns[j]
-
-
-
+                st.table(df)
 
+
+            # 1. Keyword heatmap \n
+            # 2. SDG Classification for the paragraphs/texts in the document
+            #
+
+            # with st.container():
+            #     if 'docs' in st.session_state:
+            #         docs = st.session_state['docs']
+            #         docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
+            #         # paraList = st.session_state['paraList']
+            #         logging.info("keybert")
+            #         with st.spinner("Running Key bert"):
+
+            #             kw_model = load_keyBert()
+
+            #             keywords = kw_model.extract_keywords(
+            #             all_text,
+            #             keyphrase_ngram_range=(1, 3),
+            #             use_mmr=True,
+            #             stop_words="english",
+            #             top_n=10,
+            #             diversity=0.7,
+            #             )
+
+            #             st.markdown("## 🎈 What is my document about?")
+
+            #             df = (
+            #                 DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
+            #                 .sort_values(by="Relevancy", ascending=False)
+            #                 .reset_index(drop=True)
+            #             )
+            #             df1 = (
+            #                 DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
+            #                 .sort_values(by="Relevancy", ascending=False)
+            #                 .reset_index(drop=True)
+            #             )
+            #             df.index += 1
+
+            #             # Add styling
+            #             cmGreen = sns.light_palette("green", as_cmap=True)
+            #             cmRed = sns.light_palette("red", as_cmap=True)
+            #             df = df.style.background_gradient(
+            #                 cmap=cmGreen,
+            #                 subset=[
+            #                     "Relevancy",
+            #                 ],
+            #             )
+
+            #             c1, c2, c3 = st.columns([1, 3, 1])
+
+            #             format_dictionary = {
+            #                 "Relevancy": "{:.1%}",
+            #             }
+
+            #             df = df.format(format_dictionary)
+
+            #             with c2:
+            #
+            #                 st.table(df)
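
The new `udfPreprocess.sdg.sdg_classification` helper that now returns `df` and the per-label counts `x` is likewise not included in this commit. A minimal sketch, assembled from the inline classification code that was removed from sdg_analysis.py above (the 0.85 relevancy cut-off and the "jonas/sdg_classifier_osdg" model come from that removed code), might be:

    # hypothetical udfPreprocess/sdg.py, mirroring the removed inline classifier code
    from pandas import DataFrame
    from transformers import pipeline

    def sdg_classification(par_list, threshold=0.85):
        # classify each paragraph, keep confident hits, count hits per SDG label
        classifier = pipeline("text-classification", model="jonas/sdg_classifier_osdg")
        labels = classifier(par_list)
        labels_ = [(l['label'], l['score']) for l in labels]
        df = DataFrame(labels_, columns=["SDG", "Relevancy"])
        df['text'] = par_list
        df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
        df.index += 1
        df = df[df['Relevancy'] > threshold]
        x = df['SDG'].value_counts()   # feeds the pie chart in the app
        return df, x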
sample/keywordexample.json
ADDED
@@ -0,0 +1,7 @@
+{"I will enter my own keyword":[],
+"Food":"Food security,Nutrition,Diets,Food loss",
+"Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
+"Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority",
+"Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
+"Implementation":"Implementation,transformation,reform,integration,strategy,policy"
+}
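
Each category value is a single comma-separated string; keyword_search.py shows it in the text input and, for the 'Exact Matches' path, splits it on commas so every keyword becomes its own BM25 query. A small illustration of that flow:

    import json

    with open('sample/keywordexample.json', 'r') as json_file:
        keywordexample = json.load(json_file)

    queryList = keywordexample['Food'].split(",")
    # ['Food security', 'Nutrition', 'Diets', 'Food loss'] -> one lexical search per keyword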