prashant committed
Commit f47e7d4
1 Parent(s): 8f1008c
reverting changes
- udfPreprocess/cleaning.py +4 -16
- udfPreprocess/docPreprocessing.py +6 -6
- udfPreprocess/paramconfig.cfg +0 -12
- udfPreprocess/sdg.py +0 -57
- udfPreprocess/search.py +0 -145
- udfPreprocess/uploadAndExample.py +0 -48
udfPreprocess/cleaning.py
CHANGED
@@ -1,4 +1,3 @@
-import logging
 import pandas as pd
 import numpy as np
 import string
@@ -11,7 +10,7 @@ import streamlit as st
 from haystack.nodes import PreProcessor
 
 '''basic cleaning - suitable for transformer models'''
-def basic(s,SDG = False):
+def basic(s):
     """
     :param s: string to be processed
     :return: processed string: see comments in the source code for more info
@@ -24,15 +23,6 @@ def basic(s,SDG = False):
     # Remove URLs
     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
     s = re.sub(r"http\S+", " ", s)
-    if SDG == True:
-        s = s.lower()
-        translator = str.maketrans(' ', ' ', string.punctuation)
-        s = s.translate(translator)
-        s = re.sub('\n', ' ', s)
-        s = re.sub("\'", " ", s)
-        s = re.sub(r'\d+', ' ', s)
-        s = re.sub(r'\W+', ' ', s)
-
     # Remove new line characters
     #s = re.sub('\n', ' ', s)
 
@@ -69,10 +59,9 @@ def preprocessingForSDG(document):
     for i in document:
         docs_processed = preprocessor.process([i])
         for item in docs_processed:
-            item.content = basic(item.content, SDG = True)
+            item.content = basic(item.content)
 
-
-    logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
+    st.write("your document has been splitted to", len(docs_processed), "paragraphs")
 
     # create dataframe of text and list of all text
     df = pd.DataFrame(docs_processed)
@@ -104,8 +93,7 @@ def preprocessing(document):
     for item in docs_processed:
         item.content = basic(item.content)
 
-
-    logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
+    st.write("your document has been splitted to", len(docs_processed), "paragraphs")
 
     # create dataframe of text and list of all text
     df = pd.DataFrame(docs_processed)
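For context: after this revert, basic() takes only the string to clean, and the two preprocessing entry points report progress with st.write, so they assume a running Streamlit app. A minimal usage sketch, not part of the commit itself (the sample text is invented):

# Hypothetical usage sketch of the reverted API; the sample string is invented.
import udfPreprocess.cleaning as clean

sample = "See https://example.org for details.\nSecond paragraph of text."
cleaned = clean.basic(sample)   # single-argument call after the revert

# clean.preprocessing(docs) and clean.preprocessingForSDG(docs) both expect a list of
# haystack Documents and now report progress via st.write, i.e. a Streamlit context.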
udfPreprocess/docPreprocessing.py
CHANGED
@@ -65,11 +65,11 @@ def load_document(
     This can happen whith certain pdf types.'''
     for i in documents:
         if i.content == "":
-
-
-
-
-
-
+            st.write("using pdfplumber")
+            text = []
+            with pdfplumber.open(file_path) as pdf:
+                for page in pdf.pages:
+                    text.append(page.extract_text())
+            i.content = ' '.join([page for page in text])
 
     return documents
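The restored block is a pdfplumber fallback used when the primary converter returns an empty Document. A standalone sketch of the same pattern, with an assumed helper name (fill_empty_content is illustrative, not a function in this repo):

# Sketch of the fallback pattern restored above; fill_empty_content is an
# illustrative name, not part of this repository.
import pdfplumber

def fill_empty_content(documents, file_path):
    for doc in documents:
        if doc.content == "":
            with pdfplumber.open(file_path) as pdf:
                # extract_text() can return None for image-only pages
                doc.content = ' '.join(page.extract_text() or '' for page in pdf.pages)
    return documents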
udfPreprocess/paramconfig.cfg
DELETED
@@ -1,12 +0,0 @@
-[lexical_search]
-TOP_K = 10
-THRESHOLD = 0.1
-
-[semantic_search]
-TOP_K = 10
-MAX_SEQ_LENGTH = 64
-MODEL_NAME = msmarco-distilbert-cos-v5
-THRESHOLD = 0.1
-
-[sdg]
-THRESHOLD = 0.85
udfPreprocess/sdg.py
DELETED
@@ -1,57 +0,0 @@
-import glob, os, sys;
-sys.path.append('../udfPreprocess')
-
-#import helper
-import udfPreprocess.docPreprocessing as pre
-import udfPreprocess.cleaning as clean
-
-#import needed libraries
-import seaborn as sns
-from pandas import DataFrame
-from keybert import KeyBERT
-from transformers import pipeline
-import matplotlib.pyplot as plt
-import numpy as np
-import streamlit as st
-import pandas as pd
-import docx
-from docx.shared import Inches
-from docx.shared import Pt
-from docx.enum.style import WD_STYLE_TYPE
-
-import tempfile
-import sqlite3
-import logging
-logger = logging.getLogger(__name__)
-import configparser
-
-@st.cache(allow_output_mutation=True)
-def load_sdgClassifier():
-    classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
-    logging.info("Loading classifier")
-    return classifier
-
-def sdg_classification(par_list):
-    logging.info("running SDG classifiication")
-    config = configparser.ConfigParser()
-    config.read_file(open('udfPreprocess/paramconfig.cfg'))
-    threshold = float(config.get('sdg','THRESHOLD'))
-
-
-    classifier = load_sdgClassifier()
-    labels = classifier(par_list)
-
-    labels_= [(l['label'],l['score']) for l in labels]
-    # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
-    df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
-
-    df2['text'] = par_list
-    df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
-    df2.index += 1
-    df2 =df2[df2['Relevancy']>threshold]
-    x = df2['SDG'].value_counts()
-    df3 = df2.copy()
-    df3= df3.drop(['Relevancy'], axis = 1)
-
-
-    return df3, x
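Before its deletion, this module exposed sdg_classification(), which returned a threshold-filtered DataFrame plus the SDG value counts. A hypothetical call with invented paragraphs would have looked roughly like:

# Illustrative only; the paragraphs are invented, and paramconfig.cfg (also deleted
# in this commit) must exist for the threshold lookup inside sdg_classification.
import udfPreprocess.sdg as sdg

paragraphs = ["Expand access to affordable clean energy in rural districts.",
              "Raise primary school enrolment rates for girls."]
df_sdg, sdg_counts = sdg.sdg_classification(paragraphs)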
udfPreprocess/search.py
DELETED
@@ -1,145 +0,0 @@
-import glob, os, sys; sys.path.append('../udfPreprocess')
-
-#import helper
-import udfPreprocess.docPreprocessing as pre
-import udfPreprocess.cleaning as clean
-
-#import needed libraries
-import seaborn as sns
-from pandas import DataFrame
-from sentence_transformers import SentenceTransformer, CrossEncoder, util
-# from keybert import KeyBERT
-from transformers import pipeline
-import matplotlib.pyplot as plt
-import numpy as np
-import streamlit as st
-import pandas as pd
-from rank_bm25 import BM25Okapi
-from sklearn.feature_extraction import _stop_words
-import string
-from tqdm.autonotebook import tqdm
-import numpy as np
-import docx
-from docx.shared import Inches
-from docx.shared import Pt
-from docx.enum.style import WD_STYLE_TYPE
-import logging
-logger = logging.getLogger(__name__)
-import tempfile
-import sqlite3
-import configparser
-
-### These are lexcial search related functions/methods#####
-
-def bm25_tokenizer(text):
-    tokenized_doc = []
-    for token in text.lower().split():
-        token = token.strip(string.punctuation)
-
-        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
-            tokenized_doc.append(token)
-    return tokenized_doc
-
-def bm25TokenizeDoc(paraList):
-    tokenized_corpus = []
-    ##########Commenting this for now########### will incorporate paragrpah splitting later.
-    # for passage in tqdm(paraList):
-    #     if len(passage.split()) >256:
-    #         # st.write("Splitting")
-    #         temp = " ".join(passage.split()[:256])
-    #         tokenized_corpus.append(bm25_tokenizer(temp))
-    #         temp = " ".join(passage.split()[256:])
-    #         tokenized_corpus.append(bm25_tokenizer(temp))
-    #     else:
-    #         tokenized_corpus.append(bm25_tokenizer(passage))
-    ######################################################################################33333
-    for passage in tqdm(paraList):
-        tokenized_corpus.append(bm25_tokenizer(passage))
-
-    return tokenized_corpus
-
-def lexical_search(keyword, document_bm25):
-    config = configparser.ConfigParser()
-    config.read_file(open('udfPreprocess/paramconfig.cfg'))
-    top_k = int(config.get('lexical_search','TOP_K'))
-    bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
-    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
-    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
-    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
-    return bm25_hits
-
-@st.cache(allow_output_mutation=True)
-def load_sentenceTransformer(name):
-    return SentenceTransformer(name)
-
-
-def semantic_search(keywordlist,paraList):
-
-    ##### Sematic Search #####
-    #query = "Does document contain {} issues ?".format(keyword)
-    config = configparser.ConfigParser()
-    config.read_file(open('udfPreprocess/paramconfig.cfg'))
-    model_name = config.get('semantic_search','MODEL_NAME')
-
-    bi_encoder = load_sentenceTransformer(model_name)
-    bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH'))     #Truncate long passages to 256 tokens
-    top_k = int(config.get('semantic_search','TOP_K'))
-    document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
-    question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)
-
-    hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
-
-    return hits
-
-def show_results(keywordList):
-    document = docx.Document()
-    # document.add_heading('Document name:{}'.format(file_name), 2)
-    section = document.sections[0]
-
-    # Calling the footer
-    footer = section.footer
-
-    # Calling the paragraph already present in
-    # the footer section
-    footer_para = footer.paragraphs[0]
-
-    font_styles = document.styles
-    font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
-    font_object = font_charstyle.font
-    font_object.size = Pt(7)
-    # Adding the centered zoned footer
-    footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
-    document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)
-    for keyword in keywordList:
-
-        st.write("Results for Query: {}".format(keyword))
-        para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
-        para.font.size = Pt(12)
-        bm25_hits, hits = search(keyword)
-
-        st.markdown("""
-                    We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-                    """)
-        # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-        st.markdown("Top few lexical search (BM25) hits")
-        document.add_paragraph("Top few lexical search (BM25) hits")
-
-        for hit in bm25_hits[0:5]:
-            if hit['score'] > 0.00:
-                st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-                document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-
-
-        # st.table(bm25_hits[0:3])
-
-        st.markdown("\n-------------------------\n")
-        st.markdown("Top few Bi-Encoder Retrieval hits")
-        document.add_paragraph("\n-------------------------\n")
-        document.add_paragraph("Top few Bi-Encoder Retrieval hits")
-
-        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-        for hit in hits[0:5]:
-            # if hit['score'] > 0.45:
-            st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-            document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
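The deleted search.py combined BM25 lexical search with a bi-encoder semantic search, both configured from paramconfig.cfg. Roughly how the pieces fit together, as a sketch with invented inputs:

# Illustrative only; paraList and the query are invented, and both helpers read
# paramconfig.cfg (deleted in this commit) for their TOP_K and model settings.
from rank_bm25 import BM25Okapi
import udfPreprocess.search as search

paraList = ["Paragraph about renewable energy targets.",
            "Paragraph about school enrolment."]
bm25_index = BM25Okapi(search.bm25TokenizeDoc(paraList))
bm25_hits = search.lexical_search("renewable energy", bm25_index)
semantic_hits = search.semantic_search(["renewable energy"], paraList)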
udfPreprocess/uploadAndExample.py
DELETED
@@ -1,48 +0,0 @@
-import streamlit as st
-import tempfile
-import udfPreprocess.docPreprocessing as pre
-import udfPreprocess.cleaning as clean
-
-def add_upload(choice):
-
-
-    if choice == 'Upload Document':
-        uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
-        if uploaded_file is not None:
-            with tempfile.NamedTemporaryFile(mode="wb") as temp:
-                bytes_data = uploaded_file.getvalue()
-                temp.write(bytes_data)
-                st.session_state['filename'] = uploaded_file.name
-                # st.write("Uploaded Filename: ", uploaded_file.name)
-                file_name = uploaded_file.name
-                file_path = temp.name
-                docs = pre.load_document(file_path, file_name)
-                haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-                st.session_state['docs'] = docs
-                st.session_state['paraList'] = paraList
-
-
-    else:
-        # listing the options
-        option = st.sidebar.selectbox('Select the example document',
-                                      ('South Africa:Low Emission strategy',
-                                       'Ethiopia: 10 Year Development Plan'))
-        if option is 'South Africa:Low Emission strategy':
-            file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
-            st.session_state['filename'] = file_name
-            # st.write("Selected document:", file_name.split('/')[1])
-            # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
-            #     file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
-        else:
-            # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
-            file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
-            st.session_state['filename'] = file_name
-            # st.write("Selected document:", file_name.split('/')[1])
-
-        if option is not None:
-            docs = pre.load_document(file_path,file_name)
-            haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-            st.session_state['docs'] = docs
-            st.session_state['paraList'] = paraList
-