prashant committed on
Commit • 72e4dad
Parent(s): 49a314a
ver0.2 appstore update

Files changed:
- appStore/info.py +8 -1
- appStore/keyword_search.py +114 -490
- appStore/multiapp.py +33 -8
- appStore/sdg_analysis.py +113 -230
- sample/keywordexample.json +7 -0
appStore/info.py
CHANGED
@@ -2,6 +2,13 @@ import streamlit as st
 
 
 def app():
+    # if 'file' in st.session_state:
+    #     file = st.session_state['file']
+    # else:
+    #     st.sidebar.markdown(" :cloud: Upload document ")
+    #     uploaded_file = st.sidebar.file_uploader('', type=['pdf', 'docx', 'txt']) #Upload PDF File
+    #     st.session_state['file'] = uploaded_file
+
     with open('style.css') as f:
         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
     footer = """
@@ -33,7 +40,7 @@ The collaboration aims to determine the potential of NLP methods for tracking po
     """
     st.markdown(intro, unsafe_allow_html=True)
     st.image("appStore/img/pic1.png", caption="NDC Coherence")
-    st.subheader("Methodology")
+    #st.subheader("Methodology")
     #st.write("Each sentence in the generated answer ends with a coloured tooltip; the colour ranges from red to green. "
     #         "The tooltip contains a value representing answer sentence similarity to a specific sentence in the "
     #         "Wikipedia context passages retrieved. Mouseover on the tooltip will show the sentence from the "
appStore/keyword_search.py
CHANGED
@@ -1,10 +1,12 @@
 # set path
-import glob, os, sys
+import glob, os, sys
+from udfPreprocess.search import semantic_search
+sys.path.append('../udfPreprocess')
 
 #import helper
 import udfPreprocess.docPreprocessing as pre
 import udfPreprocess.cleaning as clean
-
+from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
 #import needed libraries
 import seaborn as sns
 from pandas import DataFrame
@@ -24,20 +26,24 @@ import docx
 from docx.shared import Inches
 from docx.shared import Pt
 from docx.enum.style import WD_STYLE_TYPE
-
+import logging
+logger = logging.getLogger(__name__)
 import tempfile
 import sqlite3
+import json
+import configparser
+
 
 def app():
 
     with st.container():
         st.markdown("<h1 style='text-align: center; \
-                      color: black;'>
+                      color: black;'> Search</h1>",
                     unsafe_allow_html=True)
         st.write(' ')
         st.write(' ')
 
-        with st.expander("ℹ️ - About this app", expanded=
+    with st.expander("ℹ️ - About this app", expanded=False):
 
         st.write(
             """
@@ -45,498 +51,116 @@ def app():
             built in Streamlit for doing keyword search in \
             policy document - developed by GIZ Data and the \
             Sustainable Development Solution Network.
-            """
-            )
+            """)
 
     st.markdown("")
-
-    st.markdown("### 📌 Step One: Upload document ### ")
-
-    with st.container():
-        def bm25_tokenizer(text):
-            tokenized_doc = []
-            for token in text.lower().split():
-                token = token.strip(string.punctuation)
-
-                if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
-                    tokenized_doc.append(token)
-            return tokenized_doc
-
-        def bm25TokenizeDoc(paraList):
-            tokenized_corpus = []
-            for passage in tqdm(paraList):
-                if len(passage.split()) >256:
-                    temp = " ".join(passage.split()[:256])
-                    tokenized_corpus.append(bm25_tokenizer(temp))
-                    temp = " ".join(passage.split()[256:])
-                    tokenized_corpus.append(bm25_tokenizer(temp))
-                else:
-                    tokenized_corpus.append(bm25_tokenizer(passage))
-
-            return tokenized_corpus
-        def search(keyword):
-            ##### BM25 search (lexical search) #####
-            bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
-            top_n = np.argpartition(bm25_scores, -10)[-10:]
-            bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
-            bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
-
-            ##### Sematic Search #####
-            # Encode the query using the bi-encoder and find potentially relevant passages
-            #query = "Does document contain {} issues ?".format(keyword)
-            question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
-
-            hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
-            hits = hits[0]  # Get the hits for the first query
-
-            ##### Re-Ranking #####
-            # Now, score all retrieved passages with the cross_encoder
-            #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
-            #cross_scores = cross_encoder.predict(cross_inp)
-
-            # Sort results by the cross-encoder scores
-            #for idx in range(len(cross_scores)):
-            #    hits[idx]['cross-score'] = cross_scores[idx]
 
-        def show_results(keywordList):
-            document = docx.Document()
-            document.add_heading('Document name:{}'.format(file_name), 2)
-            section = document.sections[0]
-
-            # Calling the footer
-            footer = section.footer
-
-            # Calling the paragraph already present in
-            # the footer section
-            footer_para = footer.paragraphs[0]
 
-
-            para.font.size = Pt(12)
-            bm25_hits, hits = search(keyword)
-
-            st.markdown("""
-                        We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-                        """)
-            # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-            st.markdown("Top few lexical search (BM25) hits")
-            document.add_paragraph("Top few lexical search (BM25) hits")
-
-            for hit in bm25_hits[0:5]:
-                if hit['score'] > 0.00:
-                    st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-                    document.add_paragraph("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-            #  st.table(bm25_hits[0:3])
-
-            st.markdown("\n-------------------------\n")
-            st.markdown("Top few Bi-Encoder Retrieval hits")
-            document.add_paragraph("\n-------------------------\n")
-            document.add_paragraph("Top few Bi-Encoder Retrieval hits")
-
-            hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-            for hit in hits[0:5]:
-                #  if hit['score'] > 0.45:
-                st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-                document.add_paragraph("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-            #st.table(hits[0:3]
-            document.save('demo.docx')
-            with open("demo.docx", "rb") as file:
-                btn = st.download_button(
-                    label="Download file",
-                    data=file,
-                    file_name="demo.docx",
-                    mime="txt/docx"
-                )
-
-        @st.cache(allow_output_mutation=True)
-        def load_sentenceTransformer(name):
-            return SentenceTransformer(name)
-
-        docs = None
-        # asking user for either upload or select existing doc
-        choice = st.radio(label = 'Select the Document',
-                          help = 'You can upload the document \
-                          or else you can try a example document',
-                          options = ('Upload Document', 'Try Example'),
-                          horizontal = True)
-
-        if choice == 'Upload Document':
-            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
-            if uploaded_file is not None:
-                with tempfile.NamedTemporaryFile(mode="wb") as temp:
-                    bytes_data = uploaded_file.getvalue()
-                    temp.write(bytes_data)
-
-                    st.write("Uploaded Filename: ", uploaded_file.name)
-                    file_name = uploaded_file.name
-                    file_path = temp.name
-                    docs = pre.load_document(file_path, file_name)
-                    haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-
-        else:
-            # listing the options
-            option = st.selectbox('Select the example document',
-                                  ('South Africa:Low Emission strategy',
-                                   'Ethiopia: 10 Year Development Plan'))
-            if option is 'South Africa:Low Emission strategy':
-                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
-                st.write("Selected document:", file_name.split('/')[1])
-                # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
-                # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
-            else:
-                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
-                st.write("Selected document:", file_name.split('/')[1])
-
-            docs = pre.load_document(file_path,file_name)
-            haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-
-        if docs is not None:
-
-            bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5')    # multi-qa-MiniLM-L6-cos-v1
-            bi_encoder.max_seq_length = 64     #Truncate long passages to 256 tokens
-            top_k = 32
-
-            document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
-            tokenized_corpus = bm25TokenizeDoc(paraList)
-            document_bm25 = BM25Okapi(tokenized_corpus)
-            keywordList = None
-
-            col1, col2 = st.columns(2)
-            with col1:
-                if st.button('Climate Change Keyword Search'):
-                    keywordList = ['extreme weather', 'floods', 'droughts']
-
-                    # show_results(keywordList)
-            with col2:
-                if st.button('Gender Keywords Search'):
-                    keywordList = ['Gender', 'Women empowernment']
-
-                    # show_results(keywordList)
-
-            keyword = st.text_input("Please enter here \
-                                    what you want to search, \
-                                    we will look for similar context \
-                                    in the document.",
-                                    value="",)
-            if st.button("Find them."):
-                keywordList = [keyword]
-            if keywordList is not None:
 
-                show_results(keywordList)
-
-            # @st.cache(allow_output_mutation=True)
-            # def load_sentenceTransformer(name):
-            #     return SentenceTransformer(name)
-
-            # bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5')    # multi-qa-MiniLM-L6-cos-v1
-            # bi_encoder.max_seq_length = 64     #Truncate long passages to 256 tokens
-            # top_k = 32
-
-            # #@st.cache(allow_output_mutation=True)
-            # #def load_crossEncoder(name):
-            # #    return CrossEncoder(name)
-
 
-# return tokenized_corpus
-
-# tokenized_corpus = bm25TokenizeDoc(paraList)
-
-# document_bm25 = BM25Okapi(tokenized_corpus)
-
-# # def search(keyword):
-# #     ##### BM25 search (lexical search) #####
-# #     bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
-#     top_n = np.argpartition(bm25_scores, -10)[-10:]
-#     bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
-#     bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
-
-#     ##### Sematic Search #####
-#     # Encode the query using the bi-encoder and find potentially relevant passages
-#     #query = "Does document contain {} issues ?".format(keyword)
-#     question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
-
-#     hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
-#     hits = hits[0]  # Get the hits for the first query
-
-#     ##### Re-Ranking #####
-#     # Now, score all retrieved passages with the cross_encoder
-#     #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
-#     #cross_scores = cross_encoder.predict(cross_inp)
-
-#     # Sort results by the cross-encoder scores
-#     #for idx in range(len(cross_scores)):
-#     #    hits[idx]['cross-score'] = cross_scores[idx]
-
-#     return bm25_hits, hits
-
-# def show_results(keywordList):
-#     for keyword in keywordList:
-#         bm25_hits, hits = search(keyword)
-
-#         st.markdown("""
-#                     We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-#                     """)
-#         # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-#         st.markdown("Top few lexical search (BM25) hits")
-#         for hit in bm25_hits[0:5]:
-#             if hit['score'] > 0.00:
-#                 st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-#         #  st.table(bm25_hits[0:3])
-
-#         st.markdown("\n-------------------------\n")
-#         st.markdown("Top few Bi-Encoder Retrieval hits")
-
-#         hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-#         for hit in hits[0:5]:
-#             #  if hit['score'] > 0.45:
-#             st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-#         #st.table(hits[0:3]
-
-# #   if docs is not None:
-# #       col1, col2 = st.columns(2)
-# #       with col1:
-# #           if st.button('Gender Keywords Search'):
-# #               keywordList = ['Gender Equality', 'Women empowernment']
-# #               show_results(keywordList)
-# #       with col2:
-# #           if st.button('Climate Change Keyword Search'):
-# #               keywordList = ['extreme weather', 'floods', 'droughts']
-# #               show_results(keywordList)
-
-# #       keyword = st.text_input("Please enter here \
-# #                               what you want to search, \
-# #                               we will look for similar context \
-# #                               in the document.",
-# #                               value="",)
-# #       if st.button("Find them."):
-# #           show_results([keyword])
-
-# choice1 = st.radio(label = 'Keyword Search',
-#                    help = 'Search \
-#                    or else you can try a example document',
-#                    options = ('Enter your own Query', 'Try Example'),
-#                    horizontal = True)
-
-# if choice1 == 'Enter your own Query':
-#     keyword = st.text_input("Please enter here \
-#                             what you want to search, \
-#                             we will look for similar context \
-#                             in the document.",
-#                             value="",)
-# else:
-#     option1 = st.selectbox('Select the Predefined word cluster',
-#                            ('Gender:[Gender Equality, Women empowernment]',
-#                             'Climate change:[extreme weather, floods, droughts]',
-#                             ))
-#     if option1 == 'Gender:[Gender Equality, Women empowernment]':
-#         keywordList = ['Gender Equality', 'Women empowernment']
-#     else:
-#         keywordList = ['extreme weather', 'floods', 'droughts']
-
-# option1 = st.selectbox('Select the Predefined word cluster',
-#                        ('Gender:[Gender Equality, Women empowernment]',
-#                         'Climate change:[extreme weather, floods, droughts]',
-# #                        'Enter your Own Keyword Query'))
-# if option1 == 'Enter your Own Keyword Query':
-#     keyword = st.text_input("Please enter here \
-#                             what you want to search, \
-#                             we will look for similar context \
-#                             in the document.",
-#                             value="",)
-# elif option1 == 'Gender:[Gender Equality, Women empowernment]':
-#     keywordList = ['Gender Equality', 'Women empowernment']
-# elif option1 == 'Climate change:[extreme weather, floods, droughts]':
-#     keywordList = ['extreme weather', 'floods', 'droughts']
-
-# st.markdown("### 📌 Step Two: Search Keyword in Document ### ")
-
-# @st.cache(allow_output_mutation=True)
-# def load_sentenceTransformer(name):
-#     return SentenceTransformer(name)
-
-# bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5')    # multi-qa-MiniLM-L6-cos-v1
-# bi_encoder.max_seq_length = 64     #Truncate long passages to 256 tokens
-# top_k = 32
-
-# #@st.cache(allow_output_mutation=True)
-# #def load_crossEncoder(name):
-# #    return CrossEncoder(name)
-
-# # cross_encoder = load_crossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
-# document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
-
-# def bm25_tokenizer(text):
-#     tokenized_doc = []
-#     for token in text.lower().split():
-#         token = token.strip(string.punctuation)
-
-#         if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
-#             tokenized_doc.append(token)
-#     return tokenized_doc
-
-# def bm25TokenizeDoc(paraList):
-#     tokenized_corpus = []
-#     for passage in tqdm(paraList):
-#         if len(passage.split()) >256:
-#             temp = " ".join(passage.split()[:256])
-#             tokenized_corpus.append(bm25_tokenizer(temp))
-#             temp = " ".join(passage.split()[256:])
-#             tokenized_corpus.append(bm25_tokenizer(temp))
-#         else:
-#             tokenized_corpus.append(bm25_tokenizer(passage))
-
-#     return tokenized_corpus
-
-# tokenized_corpus = bm25TokenizeDoc(paraList)
-
-# document_bm25 = BM25Okapi(tokenized_corpus)
-
-# def search(keyword):
-#     ##### BM25 search (lexical search) #####
-#     bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
-#     top_n = np.argpartition(bm25_scores, -10)[-10:]
-#     bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
-#     bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
-
-#     # Encode the query using the bi-encoder and find potentially relevant passages
-#     #query = "Does document contain {} issues ?".format(keyword)
-#     question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
-
-#     hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
-#     hits = hits[0]  # Get the hits for the first query
-
-#     ##### Re-Ranking #####
-#     # Now, score all retrieved passages with the cross_encoder
-#     #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
-#     #cross_scores = cross_encoder.predict(cross_inp)
-
-#     # Sort results by the cross-encoder scores
-#     #for idx in range(len(cross_scores)):
-#     #    hits[idx]['cross-score'] = cross_scores[idx]
-
-#     return bm25_hits, hits
-
-# def show_results(keywordList):
-#     for keyword in keywordList:
-#         bm25_hits, hits = search(keyword)
-
-#         st.markdown("""
-#                     We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-#                     """)
-#         # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-#         st.markdown("Top few lexical search (BM25) hits")
-#         for hit in bm25_hits[0:5]:
-#             if hit['score'] > 0.00:
-#                 st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-#         #  st.table(bm25_hits[0:3])
-
-#         st.markdown("\n-------------------------\n")
-#         st.markdown("Top few Bi-Encoder Retrieval hits")
-
-#         hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-#         for hit in hits[0:5]:
-#             #  if hit['score'] > 0.45:
-#             st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-#         #st.table(hits[0:3]
-
-# #     if st.button("Find them."):
-# #         bm25_hits, hits = search(keyword)
-
-# #         st.markdown("""
-# #                     We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-# #                     """)
-# #         # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-# #         st.markdown("Top few lexical search (BM25) hits")
-# #         for hit in bm25_hits[0:5]:
-# #             if hit['score'] > 0.00:
-# #                 st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-# # #         st.table(bm25_hits[0:3])
-
-# #         st.markdown("\n-------------------------\n")
-# #         st.markdown("Top few Bi-Encoder Retrieval hits")
-
-# #         hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-# #         for hit in hits[0:5]:
-# #             #  if hit['score'] > 0.45:
-# #             st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-# #             #st.table(hits[0:3]
-
+
+
 
+    with st.sidebar:
+        with open('sample/keywordexample.json','r') as json_file:
+            keywordexample = json.load(json_file)
 
+        genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
+        if genre == 'Food':
+            keywordList = keywordexample['Food']
+        elif genre == 'Climate':
+            keywordList = keywordexample['Climate']
+        elif genre == 'Social':
+            keywordList = keywordexample['Social']
+        elif genre == 'Nature':
+            keywordList = keywordexample['Nature']
+        elif genre == 'Implementation':
+            keywordList = keywordexample['Implementation']
         else:
+            keywordList = None
 
+        searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
 
+    with st.container():
+        if keywordList is not None:
+            queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
+                                      value="{}".format(keywordList))
+        else:
+            queryList = st.text_input("Please enter here your question and we will look \
+                                      for an answer in the document OR enter the keyword you \
+                                      are looking for and we will \
+                                      we will look for similar context \
+                                      in the document.",
+                                      placeholder="Enter keyword here")
+
+        if st.button("Find them"):
+
+            if queryList == "":
+                st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
+                logging.warning("Terminated as no keyword provided")
+            else:
+
+                if 'docs' in st.session_state:
+                    docs = st.session_state['docs']
+                    paraList = st.session_state['paraList']
+
+                    if searchtype == 'Exact Matches':
+                        queryList = list(queryList.split(","))
+                        logging.info("performing lexical search")
+                        tokenized_corpus = bm25TokenizeDoc(paraList)
+                        # st.write(len(tokenized_corpus))
+                        document_bm25 = BM25Okapi(tokenized_corpus)
+
+                        with st.spinner("Performing Exact matching search (Lexical search) for you"):
+                            st.markdown("##### Top few lexical search (BM25) hits #####")
+
+                            for keyword in queryList:
+
+                                bm25_hits = lexical_search(keyword,document_bm25)
+
+                                counter = 0
+                                for hit in bm25_hits:
+                                    if hit['score'] > 0.00:
+                                        counter += 1
+                                        if counter == 1:
+                                            st.markdown("###### Results for keyword: **{}** ######".format(keyword))
+                                        # st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                                        st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
+
+                                st.markdown("---")
+                                if counter == 0:
+                                    st.write("No results found for '**{}**' ".format(keyword))
+                    else:
+                        logging.info("starting semantic search")
+                        with st.spinner("Performing Similar/Contextual search"):
+                            query = "Find {} related issues ?".format(queryList)
+                            config = configparser.ConfigParser()
+                            config.read_file(open('udfPreprocess/paramconfig.cfg'))
+                            threshold = float(config.get('semantic_search','THRESHOLD'))
+                            st.write(query)
+                            semantic_hits = semantic_search(query,paraList)
+                            st.markdown("##### Semantic search hits for {} related topics #####".format(queryList))
+
+                            for i,queryhit in enumerate(semantic_hits):
+
+                                # st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
+                                counter = 0
+                                for hit in queryhit:
+                                    counter += 1
+
+                                    if hit['score'] > threshold:
+                                        # st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                                        st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
+
+                                        # document.add_paragraph("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                                st.markdown("---")
+                            # st.write(semantic_hits)
+
+                else:
+                    st.info("🤔 No document found, please try to upload it at the sidebar!")
+                    logging.warning("Terminated as no keyword provided")
 
+
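
Note: the new `udfPreprocess/search.py` module that this page now imports (`bm25_tokenizer`, `bm25TokenizeDoc`, `lexical_search`, `semantic_search`) is not part of this commit. A minimal sketch of what it might contain, assembled from the inline BM25 and bi-encoder code removed from keyword_search.py above, could look like the following; names and model choices are assumptions carried over from the deleted code, not the actual module.

    # hypothetical udfPreprocess/search.py, mirroring the removed inline code
    import string
    import numpy as np
    from sentence_transformers import SentenceTransformer, util
    from sklearn.feature_extraction.text import _stop_words
    from tqdm import tqdm

    def bm25_tokenizer(text):
        # lowercase, strip punctuation, drop English stop words
        tokenized_doc = []
        for token in text.lower().split():
            token = token.strip(string.punctuation)
            if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
                tokenized_doc.append(token)
        return tokenized_doc

    def bm25TokenizeDoc(paraList):
        # split passages longer than 256 words in two before tokenizing
        tokenized_corpus = []
        for passage in tqdm(paraList):
            words = passage.split()
            if len(words) > 256:
                tokenized_corpus.append(bm25_tokenizer(" ".join(words[:256])))
                tokenized_corpus.append(bm25_tokenizer(" ".join(words[256:])))
            else:
                tokenized_corpus.append(bm25_tokenizer(passage))
        return tokenized_corpus

    def lexical_search(keyword, document_bm25):
        # BM25 scores for every passage; top 10 hits returned best-first
        bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
        top_n = np.argpartition(bm25_scores, -10)[-10:]
        bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
        return sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    def semantic_search(query, paraList, top_k=32):
        # bi-encoder retrieval; returns one list of hits per query,
        # which matches how the app iterates over semantic_hits
        bi_encoder = SentenceTransformer('msmarco-distilbert-cos-v5')
        bi_encoder.max_seq_length = 64
        document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
        question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
        return util.semantic_search(question_embedding, document_embeddings, top_k=top_k)

The score threshold applied to the semantic hits is read from `udfPreprocess/paramconfig.cfg` (`semantic_search.THRESHOLD`) in the app code above, so the helper itself does not need to filter.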
appStore/multiapp.py
CHANGED
@@ -2,6 +2,8 @@
 """
 import streamlit as st
 from PIL import Image
+from streamlit_option_menu import option_menu
+from udfPreprocess.uploadAndExample import add_upload
 
 class MultiApp:
     """Framework for combining multiple streamlit applications.
@@ -25,7 +27,7 @@ class MultiApp:
     def __init__(self):
         self.apps = []
 
-    def add_app(self,
+    def add_app(self,title,icon, func):
         """Adds a new application.
         Parameters
         ----------
@@ -36,16 +38,39 @@ class MultiApp:
         """
         self.apps.append({
             "title": title,
+            "icon": icon,
             "function": func
         })
 
     def run(self):
+
         st.sidebar.write(format_func=lambda app: app['title'])
-        image = Image.open('appStore/img/
+        image = Image.open('appStore/img/giz_sdsn.jpg')
         st.sidebar.image(image)
-
-
-
-
-
-
+        #st.sidebar.markdown("## 📌 Pages ")
+        #app = st.sidebar.radio(
+        #    'Pages',
+        #    self.apps,
+        # from streamlit_option_menu import option_menu
+        with st.sidebar:
+            selected = option_menu(None, [page["title"] for page in self.apps],
+                                   icons=[page["icon"] for page in self.apps],
+                                   menu_icon="cast", default_index=0)
+
+        for index, item in enumerate(self.apps):
+            if item["title"] == selected:
+                self.apps[index]["function"]()
+                break
+
+        # app['function']()
+        choice = st.sidebar.radio(label = 'Select the Document',
+                                  help = 'You can upload the document \
+                                  or else you can try a example document',
+                                  options = ('Upload Document', 'Try Example'),
+                                  horizontal = True)
+        add_upload(choice)
+        # st.sidebar.markdown('')
+        # st.sidebar.markdown(" :cloud: Upload document ")
+        # uploaded_file = st.sidebar.file_uploader('', type=['pdf', 'docx', 'txt']) #Upload PDF File
+        # st.session_state['file'] = uploaded_file
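
The `udfPreprocess.uploadAndExample.add_upload` helper called from `run()` is also outside this commit. Based on the upload/example-document logic removed from keyword_search.py and the `st.session_state['docs']` / `st.session_state['paraList']` keys the pages now read, a plausible sketch (names and file choices are assumptions) might be:

    # hypothetical udfPreprocess/uploadAndExample.py
    import tempfile
    import streamlit as st
    import udfPreprocess.docPreprocessing as pre
    import udfPreprocess.cleaning as clean

    def add_upload(choice):
        if choice == 'Upload Document':
            uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
            if uploaded_file is None:
                return
            with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
                temp.write(uploaded_file.getvalue())
                file_name, file_path = uploaded_file.name, temp.name
        else:
            # one of the example documents shipped in sample/
            file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
            st.sidebar.write("Selected document:", file_name.split('/')[1])

        # preprocess once and share the result with all pages via session state
        docs = pre.load_document(file_path, file_name)
        haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
        st.session_state['docs'] = docs
        st.session_state['paraList'] = paraList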
appStore/sdg_analysis.py
CHANGED
@@ -1,5 +1,6 @@
 # set path
-import glob, os, sys;
+import glob, os, sys;
+sys.path.append('../udfPreprocess')
 
 #import helper
 import udfPreprocess.docPreprocessing as pre
@@ -17,10 +18,26 @@ import pandas as pd
 import docx
 from docx.shared import Inches
 from docx.shared import Pt
-from docx.enum.style import WD_STYLE_TYPE
+from docx.enum.style import WD_STYLE_TYPE
+from udfPreprocess.sdg import sdg_classification
 
 import tempfile
 import sqlite3
+import logging
+logger = logging.getLogger(__name__)
+
+
+@st.cache(allow_output_mutation=True)
+def load_keyBert():
+    return KeyBERT()
+
+@st.cache(allow_output_mutation=True)
+def load_sdgClassifier():
+    classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
+    return classifier
+
 
 def app():
 
@@ -29,154 +46,38 @@ def app():
     st.write(' ')
     st.write(' ')
 
-    with st.expander("ℹ️ - About this app", expanded=
+    with st.expander("ℹ️ - About this app", expanded=False):
 
         st.write(
             """
-            The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solution Network. \n
-
-            2. SDG Classification for the paragraphs/texts in the document
-            """
-            )
-
+            The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents with respect to SDG Classification for the paragraphs/texts in the document - developed by GIZ Data and the Sustainable Development Solution Network. \n
+            """)
     st.markdown("")
 
-    st.markdown("")
-    st.markdown("## 📌 Step One: Upload document ")
-
-    with st.container():
-
-
-        # asking user for either upload or select existing doc
-        choice = st.radio(label = 'Select the Document',
-                          help = 'You can upload the document \
-                          or else you can try a example document',
-                          options = ('Upload Document', 'Try Example'),
-                          horizontal = True)
-
-        if choice == 'Upload Document':
-            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
-            if uploaded_file is not None:
-                with tempfile.NamedTemporaryFile(mode="wb") as temp:
-                    bytes_data = uploaded_file.getvalue()
-                    temp.write(bytes_data)
-
-                    st.write("Uploaded Filename: ", uploaded_file.name)
-                    file_name = uploaded_file.name
-                    file_path = temp.name
-                    docs = pre.load_document(file_path, file_name)
-                    docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
-                    #haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-
-        else:
-            # listing the options
-            option = st.selectbox('Select the example document',
-                                  ('Ethiopia: 10 Year Development Plan',
-                                   'South Africa:Low Emission strategy'))
-            if option is 'South Africa:Low Emission strategy':
-                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
-                st.write("Selected document:", file_name.split('/')[1])
-                # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
-                # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
-            else:
-                # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
-                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
-                st.write("Selected document:", file_name.split('/')[1])
-
-            if option is not None:
-                docs = pre.load_document(file_path,file_name)
-                # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-                docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
-
 
-        if docs is not None:
 
-
-            keywords = kw_model.extract_keywords(
-            all_text,
-            keyphrase_ngram_range=(1, 3),
-            use_mmr=True,
-            stop_words="english",
-            top_n=10,
-            diversity=0.7,
-            )
-
-            df = (
-                DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
-                .sort_values(by="Relevancy", ascending=False)
-                .reset_index(drop=True)
-            )
-            df1 = (
-                DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
-                .sort_values(by="Relevancy", ascending=False)
-                .reset_index(drop=True)
-            )
-            df.index += 1
-
-            #
-
-            )
-            c1, c2, c3 = st.columns([1, 3, 1])
-
-            format_dictionary = {
-                "Relevancy": "{:.1%}",
-            }
-
-            df = df.format(format_dictionary)
-
-            with c2:
-                st.table(df)
-
-            ######## SDG classiciation
-            # @st.cache(allow_output_mutation=True)
-            # def load_sdgClassifier():
-            #     classifier = pipeline("text-classification", model= "../models/osdg_sdg/")
-
-            #     return classifier
-
-            # load from disc (github repo) for performance boost
-            @st.cache(allow_output_mutation=True)
-            def load_sdgClassifier():
-                classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
-
-                return classifier
-
-            classifier = load_sdgClassifier()
-
-            # # not needed, par list comes from pre_processing function already
-
-            # word_list = all_text.split()
-            # len_word_list = len(word_list)
-            # par_list = []
-            # par_len = 130
-            # for i in range(0,len_word_list // par_len):
-            #     string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
-            #     par_list.append(string_part)
-
-            labels = classifier(par_list)
-            labels_= [(l['label'],l['score']) for l in labels]
-            df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
-            df2['text'] = par_list
-            df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
-            df2.index += 1
-            df2 =df2[df2['Relevancy']>.85]
-            x = df2['SDG'].value_counts()
-            df3 = df2.copy()
+    with st.container():
+
+        if 'docs' in st.session_state:
+            docs = st.session_state['docs']
+            docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
+            with st.spinner("Running SDG"):
 
+                df, x = sdg_classification(par_list)
+
+            # classifier = load_sdgClassifier()
+
+            # labels = classifier(par_list)
+            # labels_= [(l['label'],l['score']) for l in labels]
+            # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
+            # df2['text'] = par_list
+            # df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+            # df2.index += 1
+            # df2 =df2[df2['Relevancy']>.85]
+            # x = df2['SDG'].value_counts()
+            # df3 = df2.copy()
 
             plt.rcParams['font.size'] = 25
             colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
@@ -184,110 +85,92 @@ def app():
             fig, ax = plt.subplots()
             ax.pie(x, colors=colors, radius=2, center=(4, 4),
                    wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False,labels =list(x.index))
-            fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
-            st.markdown("
+            # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
+            st.markdown("#### Anything related to SDGs? ####")
+
+            # st.markdown("#### 🎈 Anything related to SDGs? ####")
 
             c4, c5, c6 = st.columns([2, 2, 2])
 
             # Add styling
             cmGreen = sns.light_palette("green", as_cmap=True)
             cmRed = sns.light_palette("red", as_cmap=True)
-            df2 = df2.style.background_gradient(
-
-
-
-
-            )
+            # df2 = df2.style.background_gradient(
+            #     cmap=cmGreen,
+            #     subset=[
+            #         "Relevancy",
+            #     ],
+            # )
 
-            format_dictionary = {
-
-            }
+            # format_dictionary = {
+            #     "Relevancy": "{:.1%}",
+            # }
 
-            df2 = df2.format(format_dictionary)
+            # df2 = df2.format(format_dictionary)
 
             with c5:
                 st.pyplot(fig)
 
             c7, c8, c9 = st.columns([1, 10, 1])
             with c8:
-                st.table(
-
-                document = docx.Document()
-                document.add_heading('Document name:{}'.format(file_name), 2)
-                # Choosing the top most section of the page
-                section = document.sections[0]
-
-                # Calling the footer
-                footer = section.footer
-
-                # Calling the paragraph already present in
-                # the footer section
-                footer_para = footer.paragraphs[0]
-
-                font_styles = document.styles
-                font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
-                font_object = font_charstyle.font
-                font_object.size = Pt(7)
-                # Adding the centered zoned footer
-                footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
-
-                #footer_para.text = "\tPowered by GIZ Data and the Sustainable Development Solution Network\
-                #                   hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev"
-                #footer_para.font.size = docx.shared.Pt(6)
-
-                document.add_heading('What is the document about', level=1)
-                t = document.add_table(df1.shape[0]+1, df1.shape[1])
-
-                # add the header rows.
-                for j in range(df1.shape[-1]):
-                    t.cell(0,j).text = df1.columns[j]
-
-
-
+                st.table(df)
 
+
+            # 1. Keyword heatmap \n
+            # 2. SDG Classification for the paragraphs/texts in the document
+            #
+
+            # with st.container():
+            #     if 'docs' in st.session_state:
+            #         docs = st.session_state['docs']
+            #         docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
+            #         # paraList = st.session_state['paraList']
+            #         logging.info("keybert")
+            #         with st.spinner("Running Key bert"):
+
+            #             kw_model = load_keyBert()
+
+            #             keywords = kw_model.extract_keywords(
+            #             all_text,
+            #             keyphrase_ngram_range=(1, 3),
+            #             use_mmr=True,
+            #             stop_words="english",
+            #             top_n=10,
+            #             diversity=0.7,
+            #             )
+
+            #             st.markdown("## 🎈 What is my document about?")
+
+            #             df = (
+            #                 DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
+            #                 .sort_values(by="Relevancy", ascending=False)
+            #                 .reset_index(drop=True)
+            #             )
+            #             df1 = (
+            #                 DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
+            #                 .sort_values(by="Relevancy", ascending=False)
+            #                 .reset_index(drop=True)
+            #             )
+            #             df.index += 1
+
+            #             # Add styling
+            #             cmGreen = sns.light_palette("green", as_cmap=True)
+            #             cmRed = sns.light_palette("red", as_cmap=True)
+            #             df = df.style.background_gradient(
+            #                 cmap=cmGreen,
+            #                 subset=[
+            #                     "Relevancy",
+            #                 ],
+            #             )
+
+            #             c1, c2, c3 = st.columns([1, 3, 1])
+
+            #             format_dictionary = {
+            #                 "Relevancy": "{:.1%}",
+            #             }
+
+            #             df = df.format(format_dictionary)
+
+            #             with c2:
+            #
+            #                 st.table(df)
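
The new `udfPreprocess.sdg.sdg_classification` helper that now returns `df` and the per-label counts `x` is likewise not included in this commit. A minimal sketch, assembled from the inline classification code that was removed from sdg_analysis.py above (the 0.85 relevancy cut-off and the "jonas/sdg_classifier_osdg" model come from that removed code), might be:

    # hypothetical udfPreprocess/sdg.py, mirroring the removed inline classifier code
    from pandas import DataFrame
    from transformers import pipeline

    def sdg_classification(par_list, threshold=0.85):
        # classify each paragraph, keep confident hits, count hits per SDG label
        classifier = pipeline("text-classification", model="jonas/sdg_classifier_osdg")
        labels = classifier(par_list)
        labels_ = [(l['label'], l['score']) for l in labels]
        df = DataFrame(labels_, columns=["SDG", "Relevancy"])
        df['text'] = par_list
        df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
        df.index += 1
        df = df[df['Relevancy'] > threshold]
        x = df['SDG'].value_counts()   # feeds the pie chart in the app
        return df, x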
sample/keywordexample.json
ADDED
@@ -0,0 +1,7 @@
+{"I will enter my own keyword":[],
+"Food":"Food security,Nutrition,Diets,Food loss",
+"Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
+"Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority",
+"Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
+"Implementation":"Implementation,transformation,reform,integration,strategy,policy"
+}
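
Each category value is a single comma-separated string; keyword_search.py shows it in the text input and, for the 'Exact Matches' path, splits it on commas so every keyword becomes its own BM25 query. A small illustration of that flow:

    import json

    with open('sample/keywordexample.json', 'r') as json_file:
        keywordexample = json.load(json_file)

    queryList = keywordexample['Food'].split(",")
    # ['Food security', 'Nutrition', 'Diets', 'Food loss'] -> one lexical search per keyword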