new_version
Browse files- .gitattributes +3 -0
- app.py +17 -0
- appStore/__init__.py +1 -0
- appStore/coherence.py +231 -0
- appStore/img/giz_sdsn.jpg +3 -0
- appStore/img/paris.png +3 -0
- appStore/img/pic1.png +3 -0
- appStore/info.py +47 -0
- appStore/keyword_search.py +504 -0
- appStore/multiapp.py +51 -0
- appStore/sdg_analysis.py +204 -0
- ndcs/cca.txt +81 -0
- ndcs/ccm.txt +86 -0
- ndcs/countryList.txt +170 -0
- packages.txt +2 -0
- requirements.txt +14 -0
- sample/Ethiopia_s_2021_10 Year Development Plan.txt +737 -0
- sample/South Africa_s Low Emission Development Strategy.txt +0 -0
- style.css +179 -0
- udfPreprocess/__init__.py +1 -0
- udfPreprocess/cleaning.py +156 -0
- udfPreprocess/docPreprocessing.py +75 -0
.gitattributes
CHANGED
@@ -29,3 +29,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
29 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
30 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
31 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
29 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
30 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
31 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
32 |
+
appStore/img/giz_sdsn.jpg filter=lfs diff=lfs merge=lfs -text
|
33 |
+
appStore/img/paris.png filter=lfs diff=lfs merge=lfs -text
|
34 |
+
appStore/img/pic1.png filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import appStore.keyword_search as keyword_search
|
2 |
+
import appStore.sdg_analysis as sdg_analysis
|
3 |
+
import appStore.coherence as coherence
|
4 |
+
import appStore.info as info
|
5 |
+
from appStore.multiapp import MultiApp
|
6 |
+
import streamlit as st
|
7 |
+
|
8 |
+
# Set the page title and switch Streamlit to the wide layout.
st.set_page_config('SDSN x GIZ Policy Action Tracking v0.1', layout="wide")

# Build the multi-page application and register each page in order.
app = MultiApp()
for page_title, page_entry in (
    ("Analyse Policy Document", sdg_analysis.app),
    ("KeyWord Search", keyword_search.app),
    ("Check Coherence", coherence.app),
    ("Info", info.app),
):
    app.add_app(page_title, page_entry)

app.run()
|
appStore/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# creating appstore package
|
appStore/coherence.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# set path
|
2 |
+
import glob, os, sys; sys.path.append('../udfPreprocess')
|
3 |
+
|
4 |
+
#import helper
|
5 |
+
import udfPreprocess.docPreprocessing as pre
|
6 |
+
import udfPreprocess.cleaning as clean
|
7 |
+
|
8 |
+
#import needed libraries
|
9 |
+
import seaborn as sns
|
10 |
+
from pandas import DataFrame
|
11 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
12 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
13 |
+
# from keybert import KeyBERT
|
14 |
+
from transformers import pipeline
|
15 |
+
import matplotlib.pyplot as plt
|
16 |
+
import numpy as np
|
17 |
+
import streamlit as st
|
18 |
+
import pandas as pd
|
19 |
+
from rank_bm25 import BM25Okapi
|
20 |
+
from sklearn.feature_extraction import _stop_words
|
21 |
+
import string
|
22 |
+
from tqdm.autonotebook import tqdm
|
23 |
+
import numpy as np
|
24 |
+
import urllib.request
|
25 |
+
import ast
|
26 |
+
import tempfile
|
27 |
+
import sqlite3
|
28 |
+
import json
|
29 |
+
import urllib.request
|
30 |
+
import ast
|
31 |
+
def app():
    """Render the "Check Coherence" page.

    The page lets the user pick a country and a policy document (uploaded
    or one of two bundled examples), loads the reference sentences from
    ``ndcs/cca.txt`` and ``ndcs/ccm.txt``, keeps the ones that apply to the
    country according to the downloaded NDC data set, embeds sentences and
    document paragraphs with a SentenceTransformer model and writes every
    paragraph whose cosine similarity to a sentence exceeds 0.55.
    """
    # NOTE(review): the source this was reconstructed from had lost its
    # indentation; the nesting of the layout blocks below is inferred from
    # the statements themselves -- confirm against the rendered page.

    def get_document(countryCode: str):
        """Download the NDC data set and return the entries of one country.

        Returns a dict ``{category: {subcategory: entry}}``, or ``None``
        when the data set has no ``'NDCs'`` section or no entry for
        ``countryCode``.
        """
        link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
        with urllib.request.urlopen(link) as urlfile:
            data = json.loads(urlfile.read())

        documentType = 'NDCs'
        if documentType not in data or countryCode not in data[documentType]:
            return None

        # The three metadata fields are copied unchanged; every other field
        # is reduced to its 'classification' entry.
        get_dict = {}
        for key, value in data[documentType][countryCode].items():
            if key not in ['country_name', 'region_id', 'region_name']:
                get_dict[key] = value['classification']
            else:
                get_dict[key] = value

        # Group the per-subcategory entries under their parent category.
        country = {key: {} for key in data['categories']}
        for key, value in data['subcategories'].items():
            country[value['category']][key] = get_dict[key]
        return country

    def _select_sentences(sentences, threshold, category, doc):
        """Keep the sentences whose classification id exceeds *threshold*.

        For every key of *sentences* the id stored at
        ``doc[category][key]['id']`` is looked up; when it is greater than
        *threshold* the text at ``sentences[key]['id'][id_]`` is kept.
        """
        selected = {}
        for key, value in sentences.items():
            id_ = doc[category][key]['id']
            if id_ > threshold:
                selected[key] = value['id'][id_]
        return selected

    def countrySpecificCCA(cca_sent, threshold, countryCode, doc=None):
        """Return the adaptation sentences that apply to *countryCode*.

        *doc* may carry an already downloaded result of ``get_document``;
        when omitted the data set is downloaded here.
        """
        if doc is None:
            doc = get_document(countryCode)
        return _select_sentences(cca_sent, threshold,
                                 'climate change adaptation', doc)

    def countrySpecificCCM(ccm_sent, threshold, countryCode, doc=None):
        """Return the mitigation sentences that apply to *countryCode*.

        *doc* may carry an already downloaded result of ``get_document``;
        when omitted the data set is downloaded here.
        """
        if doc is None:
            doc = get_document(countryCode)
        return _select_sentences(ccm_sent, threshold,
                                 'climate change mitigation', doc)

    # Sidebar
    st.sidebar.title('Check Coherence')
    st.sidebar.write(' ')
    with open('ndcs/countryList.txt') as dfile:
        countryList = ast.literal_eval(dfile.read())
    countrynames = list(countryList.keys())

    option = st.sidebar.selectbox('Select Country', (countrynames))
    countryCode = countryList[option]

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):
        st.write(
            "\n"
            "The *Check Coherence* app is an easy-to-use interface built in Streamlit for doing analysis of policy document and finding the coherence between NDCs/New-Updated NDCs- developed by GIZ Data and the Sustainable Development Solution Network.\n"
        )
        st.markdown("")

    st.markdown("")
    st.markdown("## 📌 Step One: Upload document of the country selected ")

    with st.container():
        docs = None
        # asking user for either upload or select existing doc
        choice = st.radio(label='Select the Document',
                          help='You can upload the document '
                               'or else you can try a example document.',
                          options=('Upload Document', 'Try Example'),
                          horizontal=True)

        if choice == 'Upload Document':
            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
            if uploaded_file is not None:
                with tempfile.NamedTemporaryFile(mode="wb") as temp:
                    temp.write(uploaded_file.getvalue())
                    # The file is handed over by path, so the buffered
                    # bytes must reach the disk first.
                    temp.flush()

                    st.write("Uploaded Filename: ", uploaded_file.name)
                    file_name = uploaded_file.name
                    file_path = temp.name
                    # Loading stays inside the ``with`` block: the temporary
                    # file is removed as soon as it is closed.
                    docs = pre.load_document(file_path, file_name)
                    _, _, _, paraList = clean.preprocessing(docs)

        else:
            # listing the options
            example = st.selectbox('Select the example document',
                                   ('South Africa:Low Emission strategy',
                                    'Ethiopia: 10 Year Development Plan'))
            # Strings are compared by value; ``is`` only tests identity.
            if example == 'South Africa:Low Emission strategy':
                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
                countryCode = countryList['South Africa']
            else:
                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
                countryCode = countryList['Ethiopia']
            st.write("Selected document:", file_name.split('/')[1])

            if example is not None:
                docs = pre.load_document(file_path, file_name)
                _, _, _, paraList = clean.preprocessing(docs)

        with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
            cca_sent = ast.literal_eval(dfile.read())

        with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
            ccm_sent = ast.literal_eval(dfile.read())

        if docs is not None:
            # One download serves both the adaptation and mitigation lookup.
            ndc_doc = get_document(countryCode)
            if ndc_doc is None:
                st.warning("No NDC data is available for the selected country.")
                return

            sent_cca = countrySpecificCCA(cca_sent, 1, countryCode, ndc_doc)
            sent_ccm = countrySpecificCCM(ccm_sent, 1, countryCode, ndc_doc)

            @st.cache(allow_output_mutation=True)
            def load_sentenceTransformer(name):
                return SentenceTransformer(name)

            model = load_sentenceTransformer('all-MiniLM-L6-v2')

            document_embeddings = model.encode(paraList, show_progress_bar=True)

            genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))
            if genre == 'Climate Change Adaptation':
                sent_dict = sent_cca
            else:
                sent_dict = sent_ccm

            sent_keys = list(sent_dict.keys())
            sent_labels = list(sent_dict.values())
            if not sent_labels:
                # Nothing to compare against; an empty label matrix would
                # make the similarity computation fail.
                st.info("No NDC statements were found for this category.")
                return

            label_embeddings = model.encode(sent_labels, show_progress_bar=True)
            similarity_high_threshold = 0.55
            similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
            # Rows index the labels, columns index the paragraphs.
            label_indices, paragraph_indices = np.where(similarity_matrix > similarity_high_threshold)

            positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))

            for _label_idx, _paragraph_idx in positive_indices:
                st.write("This paragraph: \n")
                st.write(paraList[_paragraph_idx])
                st.write(f"Is relevant to: \n {sent_keys[_label_idx]}")
                st.write('-'*10)
|
231 |
+
|
appStore/img/giz_sdsn.jpg
ADDED
Git LFS Details
|
appStore/img/paris.png
ADDED
Git LFS Details
|
appStore/img/pic1.png
ADDED
Git LFS Details
|
appStore/info.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
|
4 |
+
def app():
    """Render the "Info" page: stylesheet, footer, manual text and image.

    The stylesheet is read from ``style.css`` and injected as a ``<style>``
    element; the footer and the introduction are written as raw HTML.
    """
    with open('style.css') as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

    # The HTML is kept flush left inside the literals so that no leading
    # whitespace becomes part of the markup handed to ``st.markdown``.
    footer = """
<div class="footer-custom">
Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
<a href="https://www.linkedin.com/in/jonas-nothnagel-bb42b114b/" target="_blank">Jonas Nothnagel</a> |
<a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
Guidance & Feedback - Maren Bernlöhr | Manuel Kuhn
</div>
"""
    st.markdown(footer, unsafe_allow_html=True)

    st.subheader("Policy Action Tracker Manual")
    intro = """
<div class="text">
The manual extraction of relevant information from text documents is a time-consuming task for any policy analyst.
As the amount and length of public policy documents in relation to sustainable development (such as National Development Plans and
Nationally Determined Contributions) continuously increases, a major challenge for policy action tracking – the evaluation of stated
goals and targets and their actual implementation on the ground – arises. Luckily, Artificial Intelligence (AI) and Natural Language Processing (NLP)
methods can help in shortening and easing this task for policy analysts.
For this purpose, the United Nations Sustainable Development Solutions Network (SDSN) and the Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ) GmbH
are collaborating since 2021 in the development of an AI-powered open-source web application that helps find and extract relevant information from public policy
documents faster to facilitate evidence-based decision-making processes in sustainable development and beyond.
<ul>
<li>Analysing the policy document</li>
<li>finding SDG related content</li>
<li>Make it searchable</li>
<li>compare it to the national NDC</li>
</ul>
</div>
<br>
"""
    st.markdown(intro, unsafe_allow_html=True)
    st.image("appStore/img/pic1.png", caption="NDC Coherence")
    st.subheader("Methodology")
|
40 |
+
#st.write("Each sentence in the generated answer ends with a coloured tooltip; the colour ranges from red to green. "
|
41 |
+
# "The tooltip contains a value representing answer sentence similarity to a specific sentence in the "
|
42 |
+
# "Wikipedia context passages retrieved. Mouseover on the tooltip will show the sentence from the "
|
43 |
+
# "Wikipedia context passage. If a sentence similarity is 1.0, the seq2seq model extracted and "
|
44 |
+
# "copied the sentence verbatim from Wikipedia context passages. Lower values of sentence "
|
45 |
+
# "similarity indicate the seq2seq model is struggling to generate a relevant sentence for the question "
|
46 |
+
# "asked.")
|
47 |
+
#st.image("wikipedia_answer.png", caption="Answer with similarity tooltips")
|
appStore/keyword_search.py
ADDED
@@ -0,0 +1,504 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# set path
|
2 |
+
import glob, os, sys; sys.path.append('../udfPreprocess')
|
3 |
+
|
4 |
+
#import helper
|
5 |
+
import udfPreprocess.docPreprocessing as pre
|
6 |
+
import udfPreprocess.cleaning as clean
|
7 |
+
|
8 |
+
#import needed libraries
|
9 |
+
import seaborn as sns
|
10 |
+
from pandas import DataFrame
|
11 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
12 |
+
# from keybert import KeyBERT
|
13 |
+
from transformers import pipeline
|
14 |
+
import matplotlib.pyplot as plt
|
15 |
+
import numpy as np
|
16 |
+
import streamlit as st
|
17 |
+
import pandas as pd
|
18 |
+
from rank_bm25 import BM25Okapi
|
19 |
+
from sklearn.feature_extraction import _stop_words
|
20 |
+
import string
|
21 |
+
from tqdm.autonotebook import tqdm
|
22 |
+
import numpy as np
|
23 |
+
|
24 |
+
import tempfile
|
25 |
+
import sqlite3
|
26 |
+
|
27 |
+
def app():
|
28 |
+
|
29 |
+
with st.container():
|
30 |
+
st.markdown("<h1 style='text-align: center; \
|
31 |
+
color: black;'> Keyword Search</h1>",
|
32 |
+
unsafe_allow_html=True)
|
33 |
+
st.write(' ')
|
34 |
+
st.write(' ')
|
35 |
+
|
36 |
+
with st.expander("ℹ️ - About this app", expanded=True):
|
37 |
+
|
38 |
+
st.write(
|
39 |
+
"""
|
40 |
+
The *Keyword Search* app is an easy-to-use interface \
|
41 |
+
built in Streamlit for doing keyword search in \
|
42 |
+
policy document - developed by GIZ Data and the \
|
43 |
+
Sustainable Development Solution Network.
|
44 |
+
"""
|
45 |
+
)
|
46 |
+
|
47 |
+
st.markdown("")
|
48 |
+
|
49 |
+
st.markdown("")
|
50 |
+
st.markdown("### 📌 Step One: Upload document ### ")
|
51 |
+
|
52 |
+
with st.container():
|
53 |
+
def bm25_tokenizer(text):
    """Turn *text* into the list of tokens used for BM25 scoring.

    The text is lower-cased and split on whitespace; each piece has its
    leading and trailing punctuation stripped. Pieces that end up empty or
    that are English stop words are left out.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [
        piece
        for piece in stripped
        if len(piece) > 0 and piece not in _stop_words.ENGLISH_STOP_WORDS
    ]
|
61 |
+
|
62 |
+
def bm25TokenizeDoc(paraList):
    """Tokenise every passage of *paraList* for the BM25 index.

    Returns one token list per passage, in the same order, so that entry
    ``i`` of the result always belongs to ``paraList[i]``. Passages longer
    than 256 words used to be appended as two separate entries, which
    shifted every later index relative to ``paraList``.
    """
    return [bm25_tokenizer(passage) for passage in tqdm(paraList)]
|
74 |
+
def search(keyword):
    """Look *keyword* up lexically (BM25) and semantically (bi-encoder).

    Returns a pair ``(bm25_hits, hits)``:

    * ``bm25_hits`` -- up to ten ``{'corpus_id', 'score'}`` dicts, highest
      BM25 score first;
    * ``hits`` -- the first result list of ``util.semantic_search`` for the
      encoded keyword, limited to ``top_k`` entries.
    """
    ##### BM25 search (lexical search) #####
    bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
    # argpartition rejects a kth beyond the array bounds, so never ask for
    # more entries than there are scores.
    n_best = min(10, len(bm25_scores))
    if n_best:
        top_n = np.argpartition(bm25_scores, -n_best)[-n_best:]
    else:
        top_n = []
    bm25_hits = sorted(
        ({'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n),
        key=lambda x: x['score'],
        reverse=True,
    )

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)

    hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    # NOTE: re-ranking of the hits with a cross-encoder is not performed.
    return bm25_hits, hits
|
101 |
+
|
102 |
+
def show_results(keywordList):
    """Search for every query in *keywordList* and write the results.

    For each query the first five BM25 hits with a score above zero are
    written, followed by the five best-scoring bi-encoder hits.
    """
    def _write_hit(hit):
        # One result line: score with three decimals, then the paragraph
        # with its line breaks replaced by spaces.
        paragraph = paraList[hit['corpus_id']].replace("\n", " ")
        st.write(f"\t Score: {hit['score']:.3f}: \t{paragraph}")

    for query in keywordList:
        st.write(f"Results for Query: {query}")
        lexical_hits, semantic_hits = search(query)

        st.markdown("\nWe will provide with 2 kind of results. The 'lexical search' and the semantic search.\n")
        st.markdown("Top few lexical search (BM25) hits")
        for hit in lexical_hits[:5]:
            if hit['score'] > 0.00:
                _write_hit(hit)

        st.markdown("\n-------------------------\n")
        st.markdown("Top few Bi-Encoder Retrieval hits")

        ranked = sorted(semantic_hits, key=lambda x: x['score'], reverse=True)
        for hit in ranked[:5]:
            _write_hit(hit)
|
129 |
+
#st.table(hits[0:3]
|
130 |
+
|
131 |
+
|
132 |
+
@st.cache(allow_output_mutation=True)
def load_sentenceTransformer(name):
    """Return the ``SentenceTransformer`` model identified by *name*.

    The function is wrapped in ``st.cache`` with
    ``allow_output_mutation=True``.
    """
    model = SentenceTransformer(name)
    return model
|
135 |
+
|
136 |
+
|
137 |
+
|
138 |
+
# --- Step one: choose the document to search in ---------------------------
docs = None
# asking user for either upload or select existing doc
choice = st.radio(label='Select the Document',
                  help='You can upload the document '
                       'or else you can try a example document',
                  options=('Upload Document', 'Try Example'),
                  horizontal=True)

if choice == 'Upload Document':
    uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
    if uploaded_file is not None:
        with tempfile.NamedTemporaryFile(mode="wb") as temp:
            temp.write(uploaded_file.getvalue())
            # The file is handed over by path, so the buffered bytes must
            # reach the disk first.
            temp.flush()

            st.write("Uploaded Filename: ", uploaded_file.name)
            file_name = uploaded_file.name
            file_path = temp.name
            # Loading stays inside the ``with`` block: the temporary file
            # is removed as soon as it is closed.
            docs = pre.load_document(file_path, file_name)
            haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)

else:
    # listing the options
    option = st.selectbox('Select the example document',
                          ('South Africa:Low Emission strategy',
                           'Ethiopia: 10 Year Development Plan'))
    # Strings are compared by value; ``is`` only tests identity.
    if option == 'South Africa:Low Emission strategy':
        file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
    else:
        file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
    st.write("Selected document:", file_name.split('/')[1])

    if option is not None:
        docs = pre.load_document(file_path, file_name)
        haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)

# --- Step two: index the document and run the requested queries -----------
if docs is not None:

    bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5')  # multi-qa-MiniLM-L6-cos-v1
    # NOTE(review): the previous comment spoke of 256 tokens while the value
    # set here is 64 -- confirm which limit is intended.
    bi_encoder.max_seq_length = 64
    top_k = 32

    document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
    tokenized_corpus = bm25TokenizeDoc(paraList)
    document_bm25 = BM25Okapi(tokenized_corpus)
    keywordList = None

    # Two preset query clusters, one button each.
    col1, col2 = st.columns(2)
    with col1:
        if st.button('Climate Change Keyword Search'):
            keywordList = ['extreme weather', 'floods', 'droughts']

    with col2:
        if st.button('Gender Keywords Search'):
            # "empowerment" was misspelled, so it could never match lexically.
            keywordList = ['Gender', 'Women empowerment']

    # Free-text query; it replaces a preset chosen in the same run.
    keyword = st.text_input("Please enter here "
                            "what you want to search, "
                            "we will look for similar context "
                            "in the document.",
                            value="",)
    if st.button("Find them."):
        keywordList = [keyword]
    if keywordList is not None:
        show_results(keywordList)
|
210 |
+
|
211 |
+
|
212 |
+
|
213 |
+
|
214 |
+
# @st.cache(allow_output_mutation=True)
|
215 |
+
# def load_sentenceTransformer(name):
|
216 |
+
# return SentenceTransformer(name)
|
217 |
+
|
218 |
+
# bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5') # multi-qa-MiniLM-L6-cos-v1
|
219 |
+
# bi_encoder.max_seq_length = 64 #Truncate long passages to 256 tokens
|
220 |
+
# top_k = 32
|
221 |
+
|
222 |
+
# #@st.cache(allow_output_mutation=True)
|
223 |
+
# #def load_crossEncoder(name):
|
224 |
+
# # return CrossEncoder(name)
|
225 |
+
|
226 |
+
# # cross_encoder = load_crossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
|
227 |
+
# document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
|
228 |
+
|
229 |
+
# def bm25_tokenizer(text):
|
230 |
+
# tokenized_doc = []
|
231 |
+
# for token in text.lower().split():
|
232 |
+
# token = token.strip(string.punctuation)
|
233 |
+
|
234 |
+
# if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
|
235 |
+
# tokenized_doc.append(token)
|
236 |
+
# return tokenized_doc
|
237 |
+
|
238 |
+
# def bm25TokenizeDoc(paraList):
|
239 |
+
# tokenized_corpus = []
|
240 |
+
# for passage in tqdm(paraList):
|
241 |
+
# if len(passage.split()) >256:
|
242 |
+
# temp = " ".join(passage.split()[:256])
|
243 |
+
# tokenized_corpus.append(bm25_tokenizer(temp))
|
244 |
+
# temp = " ".join(passage.split()[256:])
|
245 |
+
# tokenized_corpus.append(bm25_tokenizer(temp))
|
246 |
+
# else:
|
247 |
+
# tokenized_corpus.append(bm25_tokenizer(passage))
|
248 |
+
|
249 |
+
# return tokenized_corpus
|
250 |
+
|
251 |
+
# tokenized_corpus = bm25TokenizeDoc(paraList)
|
252 |
+
|
253 |
+
|
254 |
+
# document_bm25 = BM25Okapi(tokenized_corpus)
|
255 |
+
|
256 |
+
# # def search(keyword):
|
257 |
+
# # ##### BM25 search (lexical search) #####
|
258 |
+
# # bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
|
259 |
+
# top_n = np.argpartition(bm25_scores, -10)[-10:]
|
260 |
+
# bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
|
261 |
+
# bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
|
262 |
+
|
263 |
+
# ##### Sematic Search #####
|
264 |
+
# # Encode the query using the bi-encoder and find potentially relevant passages
|
265 |
+
# #query = "Does document contain {} issues ?".format(keyword)
|
266 |
+
# question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
|
267 |
+
|
268 |
+
# hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
|
269 |
+
# hits = hits[0] # Get the hits for the first query
|
270 |
+
|
271 |
+
|
272 |
+
# ##### Re-Ranking #####
|
273 |
+
# # Now, score all retrieved passages with the cross_encoder
|
274 |
+
# #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
|
275 |
+
# #cross_scores = cross_encoder.predict(cross_inp)
|
276 |
+
|
277 |
+
# # Sort results by the cross-encoder scores
|
278 |
+
# #for idx in range(len(cross_scores)):
|
279 |
+
# # hits[idx]['cross-score'] = cross_scores[idx]
|
280 |
+
|
281 |
+
|
282 |
+
# return bm25_hits, hits
|
283 |
+
|
284 |
+
# def show_results(keywordList):
|
285 |
+
# for keyword in keywordList:
|
286 |
+
# bm25_hits, hits = search(keyword)
|
287 |
+
|
288 |
+
# st.markdown("""
|
289 |
+
# We will provide with 2 kind of results. The 'lexical search' and the semantic search.
|
290 |
+
# """)
|
291 |
+
# # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
|
292 |
+
# st.markdown("Top few lexical search (BM25) hits")
|
293 |
+
# for hit in bm25_hits[0:5]:
|
294 |
+
# if hit['score'] > 0.00:
|
295 |
+
# st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
296 |
+
|
297 |
+
|
298 |
+
|
299 |
+
|
300 |
+
|
301 |
+
# # st.table(bm25_hits[0:3])
|
302 |
+
|
303 |
+
# st.markdown("\n-------------------------\n")
|
304 |
+
# st.markdown("Top few Bi-Encoder Retrieval hits")
|
305 |
+
|
306 |
+
# hits = sorted(hits, key=lambda x: x['score'], reverse=True)
|
307 |
+
# for hit in hits[0:5]:
|
308 |
+
# # if hit['score'] > 0.45:
|
309 |
+
# st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
310 |
+
# #st.table(hits[0:3]
|
311 |
+
|
312 |
+
|
313 |
+
# # if docs is not None:
|
314 |
+
# # col1, col2 = st.columns(2)
|
315 |
+
# # with col1:
|
316 |
+
# # if st.button('Gender Keywords Search'):
|
317 |
+
# # keywordList = ['Gender Equality', 'Women empowernment']
|
318 |
+
# # show_results(keywordList)
|
319 |
+
# # with col2:
|
320 |
+
# # if st.button('Climate Change Keyword Search'):
|
321 |
+
# # keywordList = ['extreme weather', 'floods', 'droughts']
|
322 |
+
# # show_results(keywordList)
|
323 |
+
|
324 |
+
# # keyword = st.text_input("Please enter here \
|
325 |
+
# # what you want to search, \
|
326 |
+
# # we will look for similar context \
|
327 |
+
# # in the document.",
|
328 |
+
# # value="",)
|
329 |
+
# # if st.button("Find them."):
|
330 |
+
# # show_results([keyword])
|
331 |
+
|
332 |
+
|
333 |
+
# choice1 = st.radio(label = 'Keyword Search',
|
334 |
+
# help = 'Search \
|
335 |
+
# or else you can try a example document',
|
336 |
+
# options = ('Enter your own Query', 'Try Example'),
|
337 |
+
# horizontal = True)
|
338 |
+
|
339 |
+
# if choice1 == 'Enter your own Query':
|
340 |
+
# keyword = st.text_input("Please enter here \
|
341 |
+
# what you want to search, \
|
342 |
+
# we will look for similar context \
|
343 |
+
# in the document.",
|
344 |
+
# value="",)
|
345 |
+
# else:
|
346 |
+
# option1 = st.selectbox('Select the Predefined word cluster',
|
347 |
+
# ('Gender:[Gender Equality, Women empowernment]',
|
348 |
+
# 'Climate change:[extreme weather, floods, droughts]',
|
349 |
+
# ))
|
350 |
+
# if option1 == 'Gender:[Gender Equality, Women empowernment]':
|
351 |
+
# keywordList = ['Gender Equality', 'Women empowernment']
|
352 |
+
# else:
|
353 |
+
# keywordList = ['extreme weather', 'floods', 'droughts']
|
354 |
+
|
355 |
+
# option1 = st.selectbox('Select the Predefined word cluster',
|
356 |
+
# ('Gender:[Gender Equality, Women empowernment]',
|
357 |
+
# 'Climate change:[extreme weather, floods, droughts]',
|
358 |
+
# # 'Enter your Own Keyword Query'))
|
359 |
+
# if option1 == 'Enter your Own Keyword Query':
|
360 |
+
# keyword = st.text_input("Please enter here \
|
361 |
+
# what you want to search, \
|
362 |
+
# we will look for similar context \
|
363 |
+
# in the document.",
|
364 |
+
# value="",)
|
365 |
+
# elif option1 == 'Gender:[Gender Equality, Women empowernment]':
|
366 |
+
# keywordList = ['Gender Equality', 'Women empowernment']
|
367 |
+
# elif option1 == 'Climate change:[extreme weather, floods, droughts]':
|
368 |
+
# keywordList = ['extreme weather', 'floods', 'droughts']
|
369 |
+
|
370 |
+
|
371 |
+
# st.markdown("### 📌 Step Two: Search Keyword in Document ### ")
|
372 |
+
|
373 |
+
|
374 |
+
# @st.cache(allow_output_mutation=True)
|
375 |
+
# def load_sentenceTransformer(name):
|
376 |
+
# return SentenceTransformer(name)
|
377 |
+
|
378 |
+
# bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5') # multi-qa-MiniLM-L6-cos-v1
|
379 |
+
# bi_encoder.max_seq_length = 64 #Truncate long passages to 256 tokens
|
380 |
+
# top_k = 32
|
381 |
+
|
382 |
+
# #@st.cache(allow_output_mutation=True)
|
383 |
+
# #def load_crossEncoder(name):
|
384 |
+
# # return CrossEncoder(name)
|
385 |
+
|
386 |
+
# # cross_encoder = load_crossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
|
387 |
+
# document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
|
388 |
+
|
389 |
+
# def bm25_tokenizer(text):
|
390 |
+
# tokenized_doc = []
|
391 |
+
# for token in text.lower().split():
|
392 |
+
# token = token.strip(string.punctuation)
|
393 |
+
|
394 |
+
# if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
|
395 |
+
# tokenized_doc.append(token)
|
396 |
+
# return tokenized_doc
|
397 |
+
|
398 |
+
# def bm25TokenizeDoc(paraList):
|
399 |
+
# tokenized_corpus = []
|
400 |
+
# for passage in tqdm(paraList):
|
401 |
+
# if len(passage.split()) >256:
|
402 |
+
# temp = " ".join(passage.split()[:256])
|
403 |
+
# tokenized_corpus.append(bm25_tokenizer(temp))
|
404 |
+
# temp = " ".join(passage.split()[256:])
|
405 |
+
# tokenized_corpus.append(bm25_tokenizer(temp))
|
406 |
+
# else:
|
407 |
+
# tokenized_corpus.append(bm25_tokenizer(passage))
|
408 |
+
|
409 |
+
# return tokenized_corpus
|
410 |
+
|
411 |
+
# tokenized_corpus = bm25TokenizeDoc(paraList)
|
412 |
+
|
413 |
+
|
414 |
+
# document_bm25 = BM25Okapi(tokenized_corpus)
|
415 |
+
|
416 |
+
|
417 |
+
# def search(keyword):
|
418 |
+
# ##### BM25 search (lexical search) #####
|
419 |
+
# bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
|
420 |
+
# top_n = np.argpartition(bm25_scores, -10)[-10:]
|
421 |
+
# bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
|
422 |
+
# bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
|
423 |
+
|
424 |
+
# ##### Sematic Search #####
|
425 |
+
# # Encode the query using the bi-encoder and find potentially relevant passages
|
426 |
+
# #query = "Does document contain {} issues ?".format(keyword)
|
427 |
+
# question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
|
428 |
+
|
429 |
+
# hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
|
430 |
+
# hits = hits[0] # Get the hits for the first query
|
431 |
+
|
432 |
+
|
433 |
+
# ##### Re-Ranking #####
|
434 |
+
# # Now, score all retrieved passages with the cross_encoder
|
435 |
+
# #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
|
436 |
+
# #cross_scores = cross_encoder.predict(cross_inp)
|
437 |
+
|
438 |
+
# # Sort results by the cross-encoder scores
|
439 |
+
# #for idx in range(len(cross_scores)):
|
440 |
+
# # hits[idx]['cross-score'] = cross_scores[idx]
|
441 |
+
|
442 |
+
|
443 |
+
# return bm25_hits, hits
|
444 |
+
|
445 |
+
# def show_results(keywordList):
|
446 |
+
# for keyword in keywordList:
|
447 |
+
# bm25_hits, hits = search(keyword)
|
448 |
+
|
449 |
+
# st.markdown("""
|
450 |
+
# We will provide with 2 kind of results. The 'lexical search' and the semantic search.
|
451 |
+
# """)
|
452 |
+
# # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
|
453 |
+
# st.markdown("Top few lexical search (BM25) hits")
|
454 |
+
# for hit in bm25_hits[0:5]:
|
455 |
+
# if hit['score'] > 0.00:
|
456 |
+
# st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
457 |
+
|
458 |
+
|
459 |
+
|
460 |
+
|
461 |
+
|
462 |
+
# # st.table(bm25_hits[0:3])
|
463 |
+
|
464 |
+
# st.markdown("\n-------------------------\n")
|
465 |
+
# st.markdown("Top few Bi-Encoder Retrieval hits")
|
466 |
+
|
467 |
+
# hits = sorted(hits, key=lambda x: x['score'], reverse=True)
|
468 |
+
# for hit in hits[0:5]:
|
469 |
+
# # if hit['score'] > 0.45:
|
470 |
+
# st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
471 |
+
# #st.table(hits[0:3]
|
472 |
+
|
473 |
+
|
474 |
+
|
475 |
+
|
476 |
+
# # if st.button("Find them."):
|
477 |
+
# # bm25_hits, hits = search(keyword)
|
478 |
+
|
479 |
+
# # st.markdown("""
|
480 |
+
# # We will provide with 2 kind of results. The 'lexical search' and the semantic search.
|
481 |
+
# # """)
|
482 |
+
# # # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
|
483 |
+
# # st.markdown("Top few lexical search (BM25) hits")
|
484 |
+
# # for hit in bm25_hits[0:5]:
|
485 |
+
# # if hit['score'] > 0.00:
|
486 |
+
# # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
487 |
+
|
488 |
+
|
489 |
+
|
490 |
+
|
491 |
+
|
492 |
+
# # # st.table(bm25_hits[0:3])
|
493 |
+
|
494 |
+
# # st.markdown("\n-------------------------\n")
|
495 |
+
# # st.markdown("Top few Bi-Encoder Retrieval hits")
|
496 |
+
|
497 |
+
# # hits = sorted(hits, key=lambda x: x['score'], reverse=True)
|
498 |
+
# # for hit in hits[0:5]:
|
499 |
+
# # # if hit['score'] > 0.45:
|
500 |
+
# # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
501 |
+
# # #st.table(hits[0:3]
|
502 |
+
|
503 |
+
|
504 |
+
|
appStore/multiapp.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Frameworks for running multiple Streamlit applications as a single app.
|
2 |
+
"""
|
3 |
+
import streamlit as st
|
4 |
+
from PIL import Image
|
5 |
+
|
6 |
+
class MultiApp:
    """Framework for combining multiple streamlit applications.

    Usage:
        def foo():
            st.title("Hello Foo")
        def bar():
            st.title("Hello Bar")
        app = MultiApp()
        app.add_app("Foo", foo)
        app.add_app("Bar", bar)
        app.run()
    It is also possible keep each application in a separate file.
        import foo
        import bar
        app = MultiApp()
        app.add_app("Foo", foo.app)
        app.add_app("Bar", bar.app)
        app.run()
    """

    def __init__(self):
        # Registered pages, kept in insertion order; each entry is a dict
        # with the keys "title" and "function".
        self.apps = []

    def add_app(self, title, func):
        """Adds a new application.

        Parameters
        ----------
        title:
            title of the app. Appears in the radio selector in the sidebar.
        func:
            the python function to render this app.
        """
        self.apps.append({
            "title": title,
            "function": func
        })

    def run(self):
        """Draw the sidebar (logo and page selector) and render the chosen app.

        The logo is always shown. When no app has been registered there is
        nothing to select, so the method returns after drawing the logo.
        """
        # NOTE: an earlier version also called
        # st.sidebar.write(format_func=...) with no positional argument;
        # that call rendered nothing and passed a keyword st.write does not
        # define, so it has been dropped.
        image = Image.open('appStore/img/giz_sdsn.jpg')
        st.sidebar.image(image)

        if not self.apps:
            return

        app = st.sidebar.radio(
            'Go To',
            self.apps,
            format_func=lambda app: app['title'])

        app['function']()
|
appStore/sdg_analysis.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# set path
|
2 |
+
import glob, os, sys; sys.path.append('../udfPreprocess')
|
3 |
+
|
4 |
+
#import helper
|
5 |
+
import udfPreprocess.docPreprocessing as pre
|
6 |
+
import udfPreprocess.cleaning as clean
|
7 |
+
|
8 |
+
#import needed libraries
|
9 |
+
import seaborn as sns
|
10 |
+
from pandas import DataFrame
|
11 |
+
from keybert import KeyBERT
|
12 |
+
from transformers import pipeline
|
13 |
+
import matplotlib.pyplot as plt
|
14 |
+
import numpy as np
|
15 |
+
import streamlit as st
|
16 |
+
import pandas as pd
|
17 |
+
|
18 |
+
import tempfile
|
19 |
+
import sqlite3
|
20 |
+
|
21 |
+
def app():
    """Render the "Analyse Policy Document" page.

    The page lets the user upload a document (pdf/docx/txt) or pick one of
    the bundled sample documents. Once a document is loaded it shows:

    1. a table of the top keywords/keyphrases extracted with KeyBERT, and
    2. an SDG classification of the document paragraphs: a pie chart of the
       predicted SDG labels next to a table of the paragraphs whose
       classifier score is above 0.85.
    """

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> SDSN x GIZ Policy Action Tracking v0.1</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):

        st.write(
            """
            The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solution Network. \n
            1. Keyword heatmap \n
            2. SDG Classification for the paragraphs/texts in the document
            """
        )

        st.markdown("")

    st.markdown("")
    st.markdown("## 📌 Step One: Upload document ")

    with st.container():

        docs = None
        # asking user for either upload or select existing doc
        choice = st.radio(label='Select the Document',
                          help='You can upload the document '
                               'or else you can try a example document',
                          options=('Upload Document', 'Try Example'),
                          horizontal=True)

        if choice == 'Upload Document':
            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
            if uploaded_file is not None:
                with tempfile.NamedTemporaryFile(mode="wb") as temp:
                    bytes_data = uploaded_file.getvalue()
                    temp.write(bytes_data)
                    # The loader reopens the file by path, so the buffered
                    # bytes must reach the disk before it is read.
                    temp.flush()

                    st.write("Uploaded Filename: ", uploaded_file.name)
                    file_name = uploaded_file.name
                    file_path = temp.name
                    docs = pre.load_document(file_path, file_name)
                    # Only the full text and the paragraph list are used below.
                    _, _, all_text, par_list = clean.preprocessingForSDG(docs)

        else:
            # listing the options
            option = st.selectbox('Select the example document',
                                  ('Ethiopia: 10 Year Development Plan',
                                   'South Africa:Low Emission strategy'))
            # Compare by value: identity (`is`) on a string literal is not
            # guaranteed to hold and raises a SyntaxWarning on Python 3.8+.
            if option == 'South Africa:Low Emission strategy':
                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
                st.write("Selected document:", file_name.split('/')[1])
            else:
                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
                st.write("Selected document:", file_name.split('/')[1])

            if option is not None:
                docs = pre.load_document(file_path, file_name)
                _, _, all_text, par_list = clean.preprocessingForSDG(docs)

        if docs is not None:

            # Cached so the KeyBERT model is not rebuilt on every rerun.
            @st.cache(allow_output_mutation=True)
            def load_keyBert():
                return KeyBERT()

            kw_model = load_keyBert()

            keywords = kw_model.extract_keywords(
                all_text,
                keyphrase_ngram_range=(1, 2),
                use_mmr=True,
                stop_words="english",
                top_n=15,
                diversity=0.7,
            )

            st.markdown("## 🎈 What is my document about?")

            df = (
                DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
                .sort_values(by="Relevancy", ascending=False)
                .reset_index(drop=True)
            )

            # Show ranks starting at 1 instead of 0.
            df.index += 1

            # Add styling
            cmGreen = sns.light_palette("green", as_cmap=True)
            df = df.style.background_gradient(
                cmap=cmGreen,
                subset=[
                    "Relevancy",
                ],
            )
            c1, c2, c3 = st.columns([1, 3, 1])

            format_dictionary = {
                "Relevancy": "{:.1%}",
            }

            df = df.format(format_dictionary)

            with c2:
                st.table(df)

            ######## SDG classification
            # Cached so the text-classification pipeline is created only once.
            @st.cache(allow_output_mutation=True)
            def load_sdgClassifier():
                classifier = pipeline("text-classification", model="jonas/sdg_classifier_osdg")

                return classifier

            classifier = load_sdgClassifier()

            # par_list already comes from the preprocessing step above.
            labels = classifier(par_list)
            labels_ = [(l['label'], l['score']) for l in labels]
            df = DataFrame(labels_, columns=["SDG", "Relevancy"])
            df['text'] = par_list
            df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
            df.index += 1
            # Keep only the paragraphs the classifier is confident about.
            df = df[df['Relevancy'] > .85]
            x = df['SDG'].value_counts()

            plt.rcParams['font.size'] = 25
            colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
            # plot
            fig, ax = plt.subplots()
            ax.pie(x, colors=colors, radius=2, center=(4, 4),
                   wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False, labels=list(x.index))

            st.markdown("## 🎈 Anything related to SDGs?")

            c4, c5, c6 = st.columns([5, 7, 1])

            # Add styling
            cmGreen = sns.light_palette("green", as_cmap=True)
            df = df.style.background_gradient(
                cmap=cmGreen,
                subset=[
                    "Relevancy",
                ],
            )

            format_dictionary = {
                "Relevancy": "{:.1%}",
            }

            df = df.format(format_dictionary)

            with c4:
                st.pyplot(fig)
            with c5:
                st.table(df)
|
203 |
+
|
204 |
+
|
ndcs/cca.txt
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"climate_risks_droughts": {"category": "climate change adaptation","id": {0:"(I)NDC not submitted or not yet included in analysis",
|
2 |
+
1: "Droughts are not climate risks concerns",
|
3 |
+
2: "Droughts are among the five climate risks concerns"}},
|
4 |
+
"climate_risks_extreme_weather": {"category": "climate change adaptation", "id": {0:"(I)NDC not submitted or not yet included in analysis",
|
5 |
+
1: "Extreme Weathers are not climate risks concerns",
|
6 |
+
2: "Extreme Weathers are among the five climate risks concerns"}},
|
7 |
+
"climate_risks_floods": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
|
8 |
+
1: "Floods are not climate risks concerns",
|
9 |
+
2: "Floods are among the five climate risks concerns"}},
|
10 |
+
"climate_risks_temp_increase": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
|
11 |
+
1: "Temperature increase are not climate risks concerns",
|
12 |
+
2: "Temperature increase are among the five climate risks concerns"}},
|
13 |
+
"climate_risks_sea_level_rise": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
|
14 |
+
1: "Sea level rise is not a climate risks concerns",
|
15 |
+
2: "Sea level rise is among the five climate risks concerns"}},
|
16 |
+
|
17 |
+
"priority_sectors_agriculture": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
|
18 |
+
1: "Agricultural sector is not that important in the context of adaptation ambitions",
|
19 |
+
2: "In the context of adaptation ambitions Agricultural sector is very important for the country",
|
20 |
+
3: "Agriculture sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
|
21 |
+
|
22 |
+
"priority_sectors_ecosystems": {"category": "climate change adaptation","id": {0 :"(I)NDC not submitted or not yet included in analysis",
|
23 |
+
1 :"Biodiversity and preservation of Ecosystems is not that important in the context of adaptation ambitions",
|
24 |
+
2: "In the context of adaptation ambitions Biodiversity and preservation of Ecosystems is very important for the country",
|
25 |
+
3: "Biodiversity and Ecosystems plays an importance for the country, and therefore in the adaptation ambitions Biodiversity and Ecosystems has special actions and aims"}},
|
26 |
+
"priority_sectors_forestry": {"category": "climate change adaptation", "id": {0: "(I)NDC not submitted or not yet included in analysis",
|
27 |
+
1: "Forestry sector is not that important in the context of adaptation ambitions",
|
28 |
+
2: "In the context of adaptation ambitions Forestry sector is very important for the country",
|
29 |
+
3: "Forestry sector plays an importance for the country, and therefore in the adaptation ambitions Forestry sector has special actions and aims"}},
|
30 |
+
"priority_sectors_health": {"category": "climate change adaptation","id": { 0: "(I)NDC not submitted or not yet included in analysis",
|
31 |
+
1: "Health sector is not that important in the context of adaptation ambitions",
|
32 |
+
2: "In the context of adaptation ambitions Health sector is very important for the country",
|
33 |
+
3: "Health sector plays an importance for the country, and therefore in the adaptation ambitions Health sector has special actions and aims"}},
|
34 |
+
|
35 |
+
"priority_sectors_water": {"category": "climate change adaptation","id": { 0 : "(I)NDC not submitted or not yet included in analysis",
|
36 |
+
1: "Water sector is not that important in the context of adaptation ambitions",
|
37 |
+
2: "In the context of adaptation ambitions Water sector is very important for the country",
|
38 |
+
3: "Water sector plays an importance for the country, and therefore in the adaptation ambitions Water sector has special actions and aims"}},
|
39 |
+
|
40 |
+
"vulnerability_agriculture": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
|
41 |
+
1: "Agriculture is not a vulnerable sector",
|
42 |
+
2: "Agriculture is a vulnerable sector"}},
|
43 |
+
"vulnerability_coastal_zones": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
|
44 |
+
1: "Coastal Zone is not a vulnerable sector",
|
45 |
+
2: "Coastal Zone is a vulnerable sector"}},
|
46 |
+
"vulnerability_ecosystems": {"category": "climate change adaptation", "id":{ 0: "(I)NDC not submitted or not yet included in analysis",
|
47 |
+
1: "Biodiversity and Ecosystems is not a vulnerable sector",
|
48 |
+
2: "Biodiversity and Ecosystems is a vulnerable sector"}},
|
49 |
+
"vulnerability_health": {"category": "climate change adaptation","id": {0:"(I)NDC not submitted or not yet included in analysis",
|
50 |
+
1: "Health is not a vulnerable sector",
|
51 |
+
2: "Health is a vulnerable sector"}},
|
52 |
+
"vulnerability_water": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
|
53 |
+
1: "Water is not a vulnerable sector",
|
54 |
+
2: "Water is a vulnerable sector"}},
|
55 |
+
|
56 |
+
"costs_of_adaptation": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
|
57 |
+
1: "The partial cost of adaptation is tentatively around few million dollars",
|
58 |
+
2: " The cost of adaptation will be 0-1 billion US$ until 2030",
|
59 |
+
3: " The cost of adaptation will be 1-5 billion US$ until 2030",
|
60 |
+
4: " The cost of adaptation will be 5-10 billion US$ until 2030",
|
61 |
+
5: " The cost of adaptation will be 10-20 billion US$ until 2030",
|
62 |
+
6: "The cost of adaptation will be more than 20 billion US$ until 2030"}},
|
63 |
+
"costs_of_future_climate_related_hazards": {"category": "climate change adaptation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
|
64 |
+
1: "The future losses from climate change will be huge",
|
65 |
+
2: "The climate hazards cause significant loss to economy and life, and the cost of Future losses could go around few million dollars"}},
|
66 |
+
|
67 |
+
"costs_of_recent_climate_related_hazards": {"category": "climate change adaptation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
|
68 |
+
1: "No losses indicated",
|
69 |
+
2: "In the recent climate hazards there has been significant Economic losses.",
|
70 |
+
3: "In the recent climate hazards the impact on human life has been significant",
|
71 |
+
4: "In the recent climate hazards the impact on human life has been significant and the economic loss amounts to 5.3"}},
|
72 |
+
"quantified_adaptation_targets": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
|
73 |
+
1:"No quantitative adaptation target",
|
74 |
+
2: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years",
|
75 |
+
3: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years",
|
76 |
+
4: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years"}},
|
77 |
+
|
78 |
+
"slow_onset_others": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
|
79 |
+
1:"Apart from sea level rise and temperature increase, no other specific slow onset process",
|
80 |
+
2: "There are other slow onset processes additional to sea level rise and temperature increase like loss of biodiversity, desertification, glacier retreat, salinisation or ocean acidification"}},
|
81 |
+
}
|
ndcs/ccm.txt
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"agriculture": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
|
2 |
+
1: "Agriculture sector is not considered for climate change mitigation",
|
3 |
+
2: "Agriculture sector contribution in greenhouse gases emission is significant and therefore is part of climate change mitigation",
|
4 |
+
3: "Agriculture sector contribution in greenhouse gases emission is significant. Given the importance of agriculture sector for economy and its adverse contribution in greenhouse gas emissions it is a Focus area for climate change mitigation and needs to be prioritised"}},
|
5 |
+
|
6 |
+
"energy_efficiency": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
|
7 |
+
1: "Energy Efficiency is not considered for climate change mitigation",
|
8 |
+
2: "Energy sector contribution in greenhouse gases emission is significant and therefore Energy Efficiency is part of climate change mitigation",
|
9 |
+
3: "Energy sector contribution in greenhouse gases emission is significant. Given the importance of the energy sector for economy and its adverse contribution to greenhouse gas emissions, energy efficiency is a Focus area for climate change mitigation and needs to be prioritised. The quantified renewable energy targets like for example in solar, geothermal, wind power are provided."}},
|
10 |
+
|
11 |
+
"fossil_fuel_production": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
|
12 |
+
1:"There is no recorded FFP (2016)",
|
13 |
+
2: "Fossil fuel Production is important for economy",
|
14 |
+
3:"Fossil fuel Production is important to provide for the basic requirements of the people in the country",
|
15 |
+
4:"The country's dependence on Fossil fuel production to meet energy and other requirements cannot be ignored, however the climate change impact due to the same cannot be ignored. The plans and actions as part of climate change mitigation includes measures to address production (e.g. subsidy removal; taxes); cleaner production",
|
16 |
+
5: "Fossil fuel Production is important to provide for the basic requirements of the people in the country.The country's dependence on Fossil fuel production to meet energy and other requirements cannot be ignored, however the climate change impact due to same cannot be ignored. The plans and actions as part of climate change mitigation includes measures to address production (e.g. subsidy removal; taxes); cleaner production"}},
|
17 |
+
"fossil_fuel_subsidiaries": {"category": "climate change mitigation","id":{0: "(I)NDC not submitted or not yet included in analysis",
|
18 |
+
1:"fossil Fuel subsidiaries are not considered",
|
19 |
+
2:"the alternates/subsidiaries to fossil Fuel need to be considered to meet the mitigations ambitions",
|
20 |
+
3:"The fossil fuel contribution towards greenhouse gas emissions is very high and therefore there is a need to find the alternatives/substitutes for the same. The replacement of fossil fuels with alternates is a priority focus area in the mitigation actions to meet mitigation ambitions."}},
|
21 |
+
|
22 |
+
"land_use_and_forestry": {"category": "climate change mitigation", "id":{0:"(I)NDC not submitted or not yet included in analysis",
|
23 |
+
1:"land use and forestry are not considered",
|
24 |
+
2:"the land use and forestry contribute to greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
|
25 |
+
3:"The land use and forestry contribution towards greenhouse gas emissions is significant and therefore there is need to quantify the mitigation potential land use and forestry."}},
|
26 |
+
"land_use_change": {"category": "climate change mitigation", "id": {0:"(I)NDC not submitted or not yet included in analysis",
|
27 |
+
1: "land use change Not mentioned",
|
28 |
+
2: "land use change is being considered, but there are no mitigation targets",
|
29 |
+
3: "land use change is being considered as part of mitigation targets",
|
30 |
+
4: "land use change can play an important role in mitigation efforts. As part of mitigation plan there are quantified targets for land use change.",
|
31 |
+
5: "land use change can play an important role in mitigation efforts. As part of mitigation plan there are quantified targets for land use change."}},
|
32 |
+
|
33 |
+
"renewable_energy": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
|
34 |
+
1:"renewable energy is not considered",
|
35 |
+
2:"Renewable energy are direct measure to reduce the greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
|
36 |
+
3:"Renewable energy are direct measure to reduce the greenhouse gas emissions and therefore there is need to quantify the mitigation potential in terms of renewable energy targets and specific sub-sectors of action (e.g. solar, geothermal, wind power)"}},
|
37 |
+
|
38 |
+
"temp_target": {"category": "climate change mitigation", "id": { 0: "(I)NDC not submitted or not yet included in analysis",
|
39 |
+
1:"Not mentioning global effort to limit global temperature increase to 2 degree celsius or 1.5 degree C",
|
40 |
+
2:"there is urgent need to limit global temperature increase to 2 degree celsius",
|
41 |
+
3:"there is urgent need to limit global temperature increase to 1.5 degree C",
|
42 |
+
4:"there is urgent need to limit global temperature increase to 2 degree celsius",
|
43 |
+
5:"there is urgent need to limit global temperature increase to 1.5 degree C"}},
|
44 |
+
"waste": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
|
45 |
+
1:"Waste as a topic is not mentioned",
|
46 |
+
2:"Waste reduction or management can play important role in mitigation plan and ambitions",
|
47 |
+
3:"Waste reduction or management can play an important role in sustainable development and hence is a focus area in mitigation plan and ambitions"}},
|
48 |
+
"transport": {"category": "climate change mitigation","id": {0:"(I)NDC not submitted or not yet included in analysis",
|
49 |
+
1:"Transport is not considered",
|
50 |
+
2:"Transport contribute to greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
|
51 |
+
3:"transport sector contribution towards greenhouse gas emissions is significant and therefore there is need to focus/prioritise the transport sector to meet the mitigation potential"}},
|
52 |
+
|
53 |
+
"reducing_non_co2_gases": {"category": "climate change mitigation","id": {0:"(I)NDC not submitted or not yet included in analysis",
|
54 |
+
1:"Reduction of non CO2 gases not indicated",
|
55 |
+
2:"Efforts should be made in reduction of non CO2 gases too."}},
|
56 |
+
|
57 |
+
|
58 |
+
"base_year": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
|
59 |
+
1: "No base year",
|
60 |
+
2: "the base year or reference point for measurement of emissions is year 19XX"}},
|
61 |
+
|
62 |
+
"carbon_capture_and_storage": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
|
63 |
+
1: "carbon capture and storage not indicated",
|
64 |
+
2:"With Technology advancement the mitigation efforts can also be in form of carbon capture and storage.",
|
65 |
+
3: "With technological advancement the mitigation efforts can also be in form of carbon capture and storage. This should be a focus area and more options need to be explored to do carbon capture and storage."}},
|
66 |
+
|
67 |
+
"costs_of_ccm": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
|
68 |
+
1: "(partial) costs not indicated",
|
69 |
+
2: " the mitigation actions and efforts will cost 0-1 billion US$ until 2030",
|
70 |
+
3:"the mitigation actions and efforts will cost 1-5 billion US$ until 2030",
|
71 |
+
4:"the mitigation actions and efforts will cost 5-10 billion US$ until 2030",
|
72 |
+
5: "the mitigation actions and efforts will cost 10-20 billion US$ until 2030",
|
73 |
+
6:"the mitigation actions and efforts will cost more than 20 billion US$ until 2030"}},
|
74 |
+
|
75 |
+
"market_mechanisms": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
|
76 |
+
1: "International market mechanisms not mentioned",
|
77 |
+
2:"One good mechanism to deal with greenhouse gas emissions is to explore International market mechanisms",
|
78 |
+
3: "International market mechanisms are not a good way of dealing with mitigation ambitions and therefore should not be considered. Greenhouse gas emissions cannot be part of tradable commodity.",
|
79 |
+
4: "Carbon emissions of greenhouse gas emissions are now a tradable commodity and these can provide a good source for funds and achieving mitigation ambitions. Therefore it is important to explore International market mechanisms. It is important that such means should be explored and there will be plan of actions soon to include these in meeting mitigations target",
|
80 |
+
5: "Carbon emissions of greenhouse gas emissions are now a tradable commodity and these can provide a good source for funds and achieving mitigation ambitions. Therefore it is important to explore International market mechanisms. It is important that such means should be explored and there will be plan of actions soon to include these in meeting mitigations target"}},
|
81 |
+
|
82 |
+
"redd": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
|
83 |
+
1: "REDD+ not mentioned",
|
84 |
+
2: "Reducing Emissions of Deforestation and Forest Degradation/REDD+",
|
85 |
+
3: "Reducing Emissions of Deforestation and Forest Degradation/REDD+"}},
|
86 |
+
}
|
ndcs/countryList.txt
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{'Afghanistan': 'AFG',
|
2 |
+
'Albania': 'ALB',
|
3 |
+
'Algeria': 'DZA',
|
4 |
+
'Andorra': 'AND',
|
5 |
+
'Angola': 'AGO',
|
6 |
+
'Antigua and Barbuda': 'ATG',
|
7 |
+
'Argentina': 'ARG',
|
8 |
+
'Armenia': 'ARM',
|
9 |
+
'Australia': 'AUS',
|
10 |
+
'Azerbaijan': 'AZE',
|
11 |
+
'Bahamas': 'BHS',
|
12 |
+
'Bahrain': 'BHR',
|
13 |
+
'Bangladesh': 'BGD',
|
14 |
+
'Barbados': 'BRB',
|
15 |
+
'Belarus': 'BLR',
|
16 |
+
'Belize': 'BLZ',
|
17 |
+
'Benin': 'BEN',
|
18 |
+
'Bhutan': 'BTN',
|
19 |
+
'Bolivia': 'BOL',
|
20 |
+
'Bosnia and Herzegovina': 'BIH',
|
21 |
+
'Botswana': 'BWA',
|
22 |
+
'Brazil': 'BRA',
|
23 |
+
'Brunei Darussalam': 'BRN',
|
24 |
+
'Burkina Faso': 'BFA',
|
25 |
+
'Burundi': 'BDI',
|
26 |
+
'Cabo Verde': 'CPV',
|
27 |
+
'Cambodia': 'KHM',
|
28 |
+
'Cameroon': 'CMR',
|
29 |
+
'Canada': 'CAN',
|
30 |
+
'Central African Republic': 'CAF',
|
31 |
+
'Chad': 'TCD',
|
32 |
+
'Chile': 'CHL',
|
33 |
+
'China': 'CHN',
|
34 |
+
'Colombia': 'COL',
|
35 |
+
'Comoros': 'COM',
|
36 |
+
'Congo': 'COG',
|
37 |
+
'Cook Islands': 'COK',
|
38 |
+
'Costa Rica': 'CRI',
|
39 |
+
'Cote dIvoire': 'CIV',
|
40 |
+
'Cuba': 'CUB',
|
41 |
+
"Democratic People's Republic of Korea": 'PRK',
|
42 |
+
'Democratic Republic of Congo': 'COD',
|
43 |
+
'Djibouti': 'DJI',
|
44 |
+
'Dominica': 'DMA',
|
45 |
+
'Dominican Republic': 'DOM',
|
46 |
+
'Ecuador': 'ECU',
|
47 |
+
'Egypt': 'EGY',
|
48 |
+
'El Salvador': 'SLV',
|
49 |
+
'Equatorial Guinea': 'GNQ',
|
50 |
+
'Eritrea': 'ERI',
|
51 |
+
'Ethiopia': 'ETH',
|
52 |
+
'European Union': 'EU',
|
53 |
+
'Fiji': 'FJI',
|
54 |
+
'Gabon': 'GAB',
|
55 |
+
'Gambia': 'GMB',
|
56 |
+
'Georgia': 'GEO',
|
57 |
+
'Ghana': 'GHA',
|
58 |
+
'Grenada': 'GRD',
|
59 |
+
'Guatemala': 'GTM',
|
60 |
+
'Guinea': 'GIN',
|
61 |
+
'Guinea Bissau': 'GNB',
|
62 |
+
'Guyana': 'GUY',
|
63 |
+
'Haiti': 'HTI',
|
64 |
+
'Honduras': 'HND',
|
65 |
+
'Iceland': 'ISL',
|
66 |
+
'India': 'IND',
|
67 |
+
'Indonesia': 'IDN',
|
68 |
+
'Iran': 'IRN',
|
69 |
+
'Iraq': 'IRQ',
|
70 |
+
'Israel': 'ISR',
|
71 |
+
'Jamaica': 'JAM',
|
72 |
+
'Japan': 'JPN',
|
73 |
+
'Jordan': 'JOR',
|
74 |
+
'Kazakhstan': 'KAZ',
|
75 |
+
'Kenya': 'KEN',
|
76 |
+
'Kingdom of Eswatini': 'SWZ',
|
77 |
+
'Kiribati': 'KIR',
|
78 |
+
'Kuwait': 'KWT',
|
79 |
+
'Kyrgyzstan': 'KGZ',
|
80 |
+
'Lao Peoples Democratic Republic': 'LAO',
|
81 |
+
'Lebanon': 'LBN',
|
82 |
+
'Lesotho': 'LSO',
|
83 |
+
'Liberia': 'LBR',
|
84 |
+
'Libya': 'LBY',
|
85 |
+
'Liechtenstein': 'LIE',
|
86 |
+
'Madagascar': 'MDG',
|
87 |
+
'Malawi': 'MWI',
|
88 |
+
'Malaysia': 'MYS',
|
89 |
+
'Maldives': 'MDV',
|
90 |
+
'Mali': 'MLI',
|
91 |
+
'Marshall Islands': 'MHL',
|
92 |
+
'Mauritania': 'MRT',
|
93 |
+
'Mauritius': 'MUS',
|
94 |
+
'Mexico': 'MEX',
|
95 |
+
'Micronesia': 'FSM',
|
96 |
+
'Monaco': 'MCO',
|
97 |
+
'Mongolia': 'MNG',
|
98 |
+
'Montenegro': 'MNE',
|
99 |
+
'Morocco': 'MAR',
|
100 |
+
'Mozambique': 'MOZ',
|
101 |
+
'Myanmar': 'MMR',
|
102 |
+
'Namibia': 'NAM',
|
103 |
+
'Nauru': 'NRU',
|
104 |
+
'Nepal': 'NPL',
|
105 |
+
'New Zealand': 'NZL',
|
106 |
+
'Nicaragua': 'NIC',
|
107 |
+
'Niger': 'NER',
|
108 |
+
'Nigeria': 'NGA',
|
109 |
+
'Niue': 'NIU',
|
110 |
+
'Norway': 'NOR',
|
111 |
+
'Oman': 'OMN',
|
112 |
+
'Pakistan': 'PAK',
|
113 |
+
'Palau': 'PLW',
|
114 |
+
'Palestine': 'PSE',
|
115 |
+
'Panama': 'PAN',
|
116 |
+
'Papua New Guinea': 'PNG',
|
117 |
+
'Paraguay': 'PRY',
|
118 |
+
'Peru': 'PER',
|
119 |
+
'Philippines': 'PHL',
|
120 |
+
'Qatar': 'QAT',
|
121 |
+
'Republic of Moldova': 'MDA',
|
122 |
+
'Republic of North Macedonia': 'MKD',
|
123 |
+
'Russian Federation': 'RUS',
|
124 |
+
'Rwanda': 'RWA',
|
125 |
+
'Saint Kitts and Nevis': 'KNA',
|
126 |
+
'Saint Lucia': 'LCA',
|
127 |
+
'Saint Vincent and the Grenadines': 'VCT',
|
128 |
+
'Samoa': 'WSM',
|
129 |
+
'San Marino': 'SMR',
|
130 |
+
'Sao Tome and Principe': 'STP',
|
131 |
+
'Saudi Arabia': 'SAU',
|
132 |
+
'Senegal': 'SEN',
|
133 |
+
'Serbia': 'SRB',
|
134 |
+
'Seychelles': 'SYC',
|
135 |
+
'Sierra Leone': 'SLE',
|
136 |
+
'Singapore': 'SGP',
|
137 |
+
'Solomon Islands': 'SLB',
|
138 |
+
'Somalia': 'SOM',
|
139 |
+
'South Africa': 'ZAF',
|
140 |
+
'South Korea': 'KOR',
|
141 |
+
'South Sudan': 'SSD',
|
142 |
+
'Sri Lanka': 'LKA',
|
143 |
+
'Sudan': 'SDN',
|
144 |
+
'Suriname': 'SUR',
|
145 |
+
'Switzerland': 'CHE',
|
146 |
+
'Syria': 'SYR',
|
147 |
+
'Tajikistan': 'TJK',
|
148 |
+
'Thailand': 'THA',
|
149 |
+
'Timor Leste': 'TLS',
|
150 |
+
'Togo': 'TGO',
|
151 |
+
'Tonga': 'TON',
|
152 |
+
'Trinidad and Tobago': 'TTO',
|
153 |
+
'Tunisia': 'TUN',
|
154 |
+
'Turkey': 'TUR',
|
155 |
+
'Turkmenistan': 'TKM',
|
156 |
+
'Tuvalu': 'TUV',
|
157 |
+
'Uganda': 'UGA',
|
158 |
+
'Ukraine': 'UKR',
|
159 |
+
'United Arab Emirates': 'ARE',
|
160 |
+
'United Kingdom': 'GBR',
|
161 |
+
'United Republic of Tanzania': 'TZA',
|
162 |
+
'United States of America': 'USA',
|
163 |
+
'Uruguay': 'URY',
|
164 |
+
'Uzbekistan': 'UZB',
|
165 |
+
'Vanuatu': 'VUT',
|
166 |
+
'Venezuela': 'VEN',
|
167 |
+
'Vietnam': 'VNM',
|
168 |
+
'Yemen': 'YEM',
|
169 |
+
'Zambia': 'ZMB',
|
170 |
+
'Zimbabwe': 'ZWE'}
|
packages.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
poppler-utils
|
2 |
+
xpdf
|
requirements.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
farm-haystack
|
2 |
+
farm-haystack[ocr]
|
3 |
+
spacy==3.2.0
|
4 |
+
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
|
5 |
+
keybert==0.5.1
|
6 |
+
matplotlib==3.5.1
|
7 |
+
nltk==3.7
|
8 |
+
numpy==1.22.1
|
9 |
+
pandas==1.4.0
|
10 |
+
pdfplumber==0.6.2
|
11 |
+
Pillow==9.1.1
|
12 |
+
seaborn==0.11.2
|
13 |
+
transformers==4.13.0
|
14 |
+
rank_bm25
|
sample/Ethiopia_s_2021_10 Year Development Plan.txt
ADDED
@@ -0,0 +1,737 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Ethiopia 2030: The Pathway to Prosperity
|
2 |
+
Ten Years Perspective Development Plan (2021 – 2030)
|
3 |
+
1. Baselines and Assumptions
|
4 |
+
2. Strategic pillars
|
5 |
+
3. Departures
|
6 |
+
4. Macroeconomic goals
|
7 |
+
5. Implications of the COVID-19 pandemic and necessary mitigation measures
|
8 |
+
6. Potentials/capabilities
|
9 |
+
7. Focus areas
|
10 |
+
7.1. Productive sectors
|
11 |
+
7.2. Services sector
|
12 |
+
7.3. Enabling sectors
|
13 |
+
8. Balanced and competitive development (nationally, regionally and locally)
|
14 |
+
9. Monitoring and Evaluation
|
15 |
+
Content
|
16 |
+
1. Baselines and Assumptions
|
17 |
+
Poverty Reduction (%)
|
18 |
+
Key performances of previous years
|
19 |
+
45.5 44.2
|
20 |
+
38.7
|
21 |
+
29.6
|
22 |
+
23.5
|
23 |
+
19
|
24 |
+
0
|
25 |
+
5
|
26 |
+
10
|
27 |
+
15
|
28 |
+
20
|
29 |
+
25
|
30 |
+
30
|
31 |
+
35
|
32 |
+
40
|
33 |
+
45
|
34 |
+
50
|
35 |
+
1994 2000 2005 2011 2016 2020
|
36 |
+
Percent
|
37 |
+
Year
|
38 |
+
Proportion of people living below poverty line
|
39 |
+
10.5
|
40 |
+
8.8
|
41 |
+
10.1
|
42 |
+
7.7
|
43 |
+
9
|
44 |
+
5.19-6.20
|
45 |
+
0 2 4 6 8 10 12
|
46 |
+
GTP I: 2011-2015
|
47 |
+
GTP II: 2015/16
|
48 |
+
GTP II: 2016/17
|
49 |
+
GTP II: 2017/18
|
50 |
+
GTP II: 2018/19
|
51 |
+
GTP II: 2019/20 (projection, with
|
52 |
+
COVID-19)
|
53 |
+
GDP growth rate (%)
|
54 |
+
1. Baselines and Assumptions
|
55 |
+
Share of economic sectors in GDP (%) Merchandise export as % of GDP
|
56 |
+
8.66
|
57 |
+
7.33
|
58 |
+
6.57
|
59 |
+
5.93
|
60 |
+
4.91
|
61 |
+
3.86 3.56 3.37
|
62 |
+
2.77
|
63 |
+
0
|
64 |
+
1
|
65 |
+
2
|
66 |
+
3
|
67 |
+
4
|
68 |
+
5
|
69 |
+
6
|
70 |
+
7
|
71 |
+
8
|
72 |
+
9
|
73 |
+
10
|
74 |
+
Percent
|
75 |
+
Year
|
76 |
+
46.9
|
77 |
+
45
|
78 |
+
43.5
|
79 |
+
41.4
|
80 |
+
39.5
|
81 |
+
37.1 35.9
|
82 |
+
34.5
|
83 |
+
32.8
|
84 |
+
13.4
|
85 |
+
15
|
86 |
+
17.3
|
87 |
+
18.8
|
88 |
+
21
|
89 |
+
23.5
|
90 |
+
25.7 26.9 27.8
|
91 |
+
4.7 4.8 5 5.3 5.6 6.1 6.9 6.8 6.8
|
92 |
+
7.1
|
93 |
+
8.6
|
94 |
+
10.7 12
|
95 |
+
14.2
|
96 |
+
16.2
|
97 |
+
17.8 19.1 20.1
|
98 |
+
39.8 40.1 39.2 39.8 39.4 38.4 38.6 39.4
|
99 |
+
0
|
100 |
+
5
|
101 |
+
10
|
102 |
+
15
|
103 |
+
20
|
104 |
+
25
|
105 |
+
30
|
106 |
+
35
|
107 |
+
40
|
108 |
+
45
|
109 |
+
50
|
110 |
+
2010/11 2011/12 2012/13 2013/14 2014/15 2015/16 2016/17 2017/18 2018/19
|
111 |
+
Percent
|
112 |
+
Agriculture Industry Manufacturing Construction Services
|
113 |
+
1. Baselines and Assumptions
|
114 |
+
Labour force participation (2013)
|
115 |
+
73%
|
116 |
+
7%
|
117 |
+
20%
|
118 |
+
Agriculture
|
119 |
+
Industry
|
120 |
+
Services
|
121 |
+
7%
|
122 |
+
22%
|
123 |
+
71%
|
124 |
+
Agriculture
|
125 |
+
Industry
|
126 |
+
Services
|
127 |
+
Urban labour force participation (2013)
|
128 |
+
1. Baselines and Assumptions
|
129 |
+
High and increasing Unemployment Rate
|
130 |
+
� Urban unemployment rate = 19.1% in 2018
|
131 |
+
� Youth unemployment rate = 25.3 %
|
132 |
+
? Male = 18.6%
|
133 |
+
? Female 30.9 %
|
134 |
+
� Rural unemployment rate = 2% in 2013
|
135 |
+
� Declining per capita rural land creating
|
136 |
+
disguised unemployment
|
137 |
+
402,869
|
138 |
+
471,535
|
139 |
+
Male Female Total Male Female Total
|
140 |
+
2014 2018
|
141 |
+
15-19 yr. 20-24 yr. 25-29 yr. Linear (20-24 yr.)
|
142 |
+
Number of unemployed people in urban areas
|
143 |
+
1. Baselines and Assumptions
|
144 |
+
Challenges
|
145 |
+
1. Macroeconomic imbalances
|
146 |
+
?Sustained high inflation
|
147 |
+
?High and rising unemployment especially
|
148 |
+
in urban areas
|
149 |
+
?High and rising debt burden
|
150 |
+
?Chronic foreign currency shortage
|
151 |
+
?Sluggish (though encouraging) rate of
|
152 |
+
structural change
|
153 |
+
2. Vulnerability to shocks (COVID-19, Climate
|
154 |
+
changes, Desert Locust infestation, etc)
|
155 |
+
3. Poor quality and high inequity in
|
156 |
+
infrastructure projects
|
157 |
+
4. Poor quality services in health and
|
158 |
+
education
|
159 |
+
� High repetition and dropout rates from school
|
160 |
+
1. Baselines and Assumptions
|
161 |
+
� Poor quality of growth and slow
|
162 |
+
structural change
|
163 |
+
� Excessive aid and loan
|
164 |
+
dependence for financing
|
165 |
+
infrastructural and construction
|
166 |
+
investments
|
167 |
+
� Limited success in expanding
|
168 |
+
manufacturing and modern
|
169 |
+
agriculture which have high job
|
170 |
+
creation potentials
|
171 |
+
� Weak institutional capacity as
|
172 |
+
the main culprit of all failures
|
173 |
+
? Provision of quality services
|
174 |
+
(electricity, water, telephone,
|
175 |
+
internet)
|
176 |
+
? Creation of enough jobs and
|
177 |
+
improved living standards
|
178 |
+
? Generation of reliable foreign
|
179 |
+
exchange revenue and debt-sustainable
|
180 |
+
national economic
|
181 |
+
capacity
|
182 |
+
? Completion of development
|
183 |
+
projects and investment plans
|
184 |
+
under public-private
|
185 |
+
partnerships
|
186 |
+
� Low reward for merit, productivity and effort
|
187 |
+
while low disincentive for laziness, wastefulness
|
188 |
+
and corruption
|
189 |
+
� Slow institutional change and transformation in:
|
190 |
+
? Government policies
|
191 |
+
? Investor attitude
|
192 |
+
? Youth behaviour
|
193 |
+
? Role of the intellectuals
|
194 |
+
� The need for sustained increase in production
|
195 |
+
and productivity
|
196 |
+
� The need to set a common national vision to
|
197 |
+
achieve major successes with consensus and
|
198 |
+
popular legitimacy
|
199 |
+
Major areas of failure in the economy
|
200 |
+
1. Baselines and Assumptions
|
201 |
+
� Poor quality of growth and slow
|
202 |
+
structural change
|
203 |
+
� Excessive aid and loan
|
204 |
+
dependence for financing
|
205 |
+
infrastructural and construction
|
206 |
+
investments
|
207 |
+
� Limited success in expanding
|
208 |
+
manufacturing and modern
|
209 |
+
agriculture which have high job
|
210 |
+
creation potentials
|
211 |
+
� Weak institutional capacity as
|
212 |
+
the main culprit of all failures
|
213 |
+
? Provision of quality services
|
214 |
+
(electricity, water, telephone,
|
215 |
+
internet)
|
216 |
+
? Creation of enough jobs and
|
217 |
+
improved living standards
|
218 |
+
? Generation of reliable foreign
|
219 |
+
exchange revenue and debt-sustainable
|
220 |
+
national economic
|
221 |
+
capacity
|
222 |
+
? Completion of development
|
223 |
+
projects and investment plans
|
224 |
+
under public-private
|
225 |
+
partnerships
|
226 |
+
� Low reward for merit, productivity and effort
|
227 |
+
while low disincentive for laziness, wastefulness
|
228 |
+
and corruption
|
229 |
+
� Slow institutional change and transformation in:
|
230 |
+
? Government policies
|
231 |
+
? Investor attitude
|
232 |
+
? Youth behaviour
|
233 |
+
? Role of the intellectuals
|
234 |
+
� The need for sustained increase in production
|
235 |
+
and productivity
|
236 |
+
� The need to set a common national vision to
|
237 |
+
achieve major successes with consensus and
|
238 |
+
popular legitimacy
|
239 |
+
Major areas of failure in the economy
|
240 |
+
2. Departures
|
241 |
+
1. Emphasis on quality of economic growth
|
242 |
+
2. Participation and coordination of sectors in the planning process
|
243 |
+
3. Sectoral linkages and multi-sectoral development focus
|
244 |
+
4. Preparation of national development corridors based on development potentials
|
245 |
+
5. Focus on solving institutional bottlenecks
|
246 |
+
6. The ongoing home grown economic reform programme as a springboard
|
247 |
+
7. Emphasis on resilience building, innovation and entrepreneurship
|
248 |
+
3. Strategic pillars
|
249 |
+
1. Ensure quality growth
|
250 |
+
2. Improve productivity and competitiveness
|
251 |
+
3. Undertake institutional transformation
|
252 |
+
4. Ensure private sector's leadership in the economy
|
253 |
+
5. Ensure equitable participation of women and children
|
254 |
+
6. Build climate resilient green economy
|
255 |
+
3. Strategic pillars
|
256 |
+
� Increasing export revenues and substituting imports by
|
257 |
+
reducing production costs
|
258 |
+
� Availing quality and massive infrastructure
|
259 |
+
? Linking infrastructural development with development corridors
|
260 |
+
� Producing required human resources with quality
|
261 |
+
� Producing enough and quality human resources
|
262 |
+
� Prioritizing innovative production systems
|
263 |
+
� Linking incentives with export revenue and job creation
|
264 |
+
performances
|
265 |
+
� Modernizing and enhancing the logistic system
|
266 |
+
• Creating technological competences needed for long-term
|
267 |
+
growth
|
268 |
+
� The economic growth should ensure:
|
269 |
+
? Participation of all citizens and equitable utilization of the
|
270 |
+
growth proceeds
|
271 |
+
? Improved standard of living of every citizen
|
272 |
+
? Reduced poverty in all indicators
|
273 |
+
? Reduced inflation and unemployment
|
274 |
+
� The economic growth should lead to increased
|
275 |
+
aggregate supply
|
276 |
+
� Focus on modern agriculture, manufacturing and
|
277 |
+
mining
|
278 |
+
� Emphasis on exploiting the sources of growth through
|
279 |
+
structural change
|
280 |
+
1.Ensuring quality economic growth 2. Raising production and productivity
|
281 |
+
3. Strategic pillars
|
282 |
+
� Build democratic and judicial institutions that ensure elite bargain,
|
283 |
+
national consensus, common vision and government legitimacy
|
284 |
+
� Build private sector and competition friendly bureaucracy
|
285 |
+
� Coordinate with parents, the society and teachers to make
|
286 |
+
educational institutions centers of excellence and virtuous citizens
|
287 |
+
� Coordinate with parents as well as social and religious leaders to
|
288 |
+
encourage religious institutions and their teachings contribute
|
289 |
+
towards poverty reduction efforts
|
290 |
+
� Prepare policies, strategies and legal frameworks for achieving
|
291 |
+
prosperity
|
292 |
+
� Increased focus on innovation and research
|
293 |
+
� Creating strong social security system
|
294 |
+
3. Institutional Transformation 4. Private sector's leadership in the economy
|
295 |
+
� Create conducive investment climate and incentivize
|
296 |
+
domestic investors in key sectors
|
297 |
+
� Build strong and market-led public-private partnerships in
|
298 |
+
order to ensure the establishment of inclusive and
|
299 |
+
pragmatic market economy
|
300 |
+
� Enhance access and quality of infrastructure to attract
|
301 |
+
quality foreign direct investment
|
302 |
+
� Identify new sources of growth, empower and stimulate
|
303 |
+
the private sector, and supplement the private sector in
|
304 |
+
strategic areas
|
305 |
+
� Emphasis for public-private partnership on problem
|
306 |
+
solving innovations and research activities
|
307 |
+
3. Strategic pillars
|
308 |
+
� Ensure gender equity in economic and social
|
309 |
+
sectors
|
310 |
+
? Participation of women at all levels of education
|
311 |
+
? Asset ownership of women
|
312 |
+
� Ensure fair participation of women and youth in
|
313 |
+
leadership and decision making positions
|
314 |
+
� Create awareness among citizens about the role of
|
315 |
+
women and youth in the country's overall
|
316 |
+
development
|
317 |
+
� Increase basin development efforts to fight land
|
318 |
+
degradation and to reduce pollutions
|
319 |
+
� Improve productivity and reduce GHG emissions
|
320 |
+
� Increase forest protection and development
|
321 |
+
� Increase production of electricity from renewable
|
322 |
+
sources for domestic use and for export
|
323 |
+
� Focus on modern and energy saving technologies
|
324 |
+
5. Equitable participation of women and children 6. Climate resilient green economy
|
325 |
+
4. Macroeconomic Goals
|
326 |
+
Assumptions
|
327 |
+
? Requirement to significantly reduce
|
328 |
+
poverty
|
329 |
+
? Available national potentials
|
330 |
+
? Potential for investment in the economy
|
331 |
+
? Existing potentials in each sector
|
332 |
+
? Low productivity that needs to be
|
333 |
+
improved
|
334 |
+
� Make Ethiopia a middle income
|
335 |
+
economy by 2022
|
336 |
+
� Raise per capita income to USD 1,115
|
337 |
+
in 2022
|
338 |
+
? Threshold for middle-income is USD 1,026
|
339 |
+
? Plus human development index and
|
340 |
+
economic vulnerability index
|
341 |
+
� Raise per capita income to USD 2,220
|
342 |
+
by 2030
|
343 |
+
Sectoral growth Targets (2021-2030)
|
344 |
+
Assured middle- income potential
|
345 |
+
10.2%
|
346 |
+
Average
|
347 |
+
Growth
|
348 |
+
Target
|
349 |
+
Percentage of population below poverty line
|
350 |
+
4. Macroeconomic Goals
|
351 |
+
Structural change
|
352 |
+
Financing Gaps
|
353 |
+
Reduce urban unemployment to less than 9%
|
354 |
+
?1.36 million new jobs need to be
|
355 |
+
created per annum
|
356 |
+
Sectoral composition of GDP Labour force participation
|
357 |
+
Economic
|
358 |
+
Sectors
|
359 |
+
Performance Target
|
360 |
+
2011 2015 2018/19 2030
|
361 |
+
Agriculture 45 39.7 32.8 22.0
|
362 |
+
Industry 15.1 21.2 27.6 35.9
|
363 |
+
Manufacturing 4.7 5.5 6.8 17.2
|
364 |
+
Services 39.9 39 39.4 42.1
|
365 |
+
5. Implications of the COVID-19 pandemic and necessary mitigation measures
|
366 |
+
� GDP growth for 2019/20 fiscal year is projected to be lower than its target of 9.0% by between 2.81
|
367 |
+
and 3.80 percentage points (equivalent to 58.3 - 78.8 billion birr) due to COVID-19 pandemic
|
368 |
+
• If the current scenario continues, next year's GDP growth could decline by 2.8 percentage points
|
369 |
+
� Returning the economy to its high growth trajectory requires focusing on sectors with high
|
370 |
+
productivity and job creation potentials
|
371 |
+
� Public investment should focus on empowering the private sector
|
372 |
+
� Promoting both domestic and foreign investments with the right set of incentives (merit based)
|
373 |
+
� Modernizing production systems and improving uptake of technology
|
374 |
+
� Conducting demand analysis for export commodities to remedy for the declining trend in exports
|
375 |
+
and foreign exchange earnings.
|
376 |
+
6. Potentials
|
377 |
+
� Endowment of various natural resources contributing to the growth potential
|
378 |
+
� Huge unutilized arable land creates great potential for the success of the plan
|
379 |
+
� Endowment of gemstones, ornamental, energy, metals, and metallic minerals
|
380 |
+
� Gold, coal, iron ore, potash, tantalum, marble, petroleum and other natural resources
|
381 |
+
Natural
|
382 |
+
Resources
|
383 |
+
� Large youth population and potential for demographic dividend
|
384 |
+
� Cumulative capacity in education and health
|
385 |
+
� Positive attitude and noble culture of reaching agreement among citizens
|
386 |
+
Human
|
387 |
+
capital
|
388 |
+
6. Potentials
|
389 |
+
Built physical and material capitals
|
390 |
+
?Transport and communication
|
391 |
+
? Irrigation infrastructures for modern agriculture
|
392 |
+
?Industrial Parks
|
393 |
+
?Mega energy infrastructures
|
394 |
+
Physical
|
395 |
+
capital
|
396 |
+
Unexploited
|
397 |
+
growth
|
398 |
+
potentials
|
399 |
+
� Utilizing the tourism potential through modernization
|
400 |
+
� Using the mining subsector as a source of input as well as a competitive industry in its
|
401 |
+
own right
|
402 |
+
6. Potentials
|
403 |
+
� Solving supply side bottlenecks to satisfy the existing demand
|
404 |
+
� Improving international acceptance and reliable partnerships
|
405 |
+
? The "medemer"/synergy philosophy
|
406 |
+
? The ongoing political reform measures
|
407 |
+
? The Homegrown Economic Reform programme
|
408 |
+
� Increased finance from partners and multilateral institutions
|
409 |
+
? Increased availability of foreign exchange
|
410 |
+
? Reduced debt stress for the short to medium term
|
411 |
+
? Increased potential for development
|
412 |
+
Increased
|
413 |
+
demand as
|
414 |
+
potential
|
415 |
+
Political Capital
|
416 |
+
Continental
|
417 |
+
and regional
|
418 |
+
integrations
|
419 |
+
� Regional and continental economic integration agreements
|
420 |
+
� International and continental free trade agreements
|
421 |
+
6. Potentials
|
422 |
+
Low
|
423 |
+
technology as
|
424 |
+
a potential
|
425 |
+
� Undeniably low status of technological development
|
426 |
+
� International mobility and spillover effect of technology
|
427 |
+
� Potential for development and catching up by filling the technological gaps
|
428 |
+
� Doubling crop productivity from the current 24-36 quintals per hectare will result
|
429 |
+
in 7% increase in crop production
|
430 |
+
� Raise the production efficiency of manufacturing from the current 50% to 80%
|
431 |
+
7. Focus Areas
|
432 |
+
7.1. Productive sectors: agriculture, manufacturing, mining
|
433 |
+
7.2. Service sector: tourism
|
434 |
+
7.3. Enabling sectors: energy, transport, sustainable finance,
|
435 |
+
innovation and technology, urban development, irrigation,
|
436 |
+
human capital development
|
437 |
+
7.1. Productive sectors
|
438 |
+
Agriculture Objectives
|
439 |
+
1. Free agriculture from rain dependence
|
440 |
+
2. Agricultural mechanization services
|
441 |
+
3. Contract farming, cluster approach and
|
442 |
+
land consolidation
|
443 |
+
4. Livestock, animal feed and animal health
|
444 |
+
5. Horticulture (irrigation and urban farming)
|
445 |
+
6. Private sector participation
|
446 |
+
7. Institutional implementation capacity
|
447 |
+
8. Climate resilient sustainable agricultural
|
448 |
+
development
|
449 |
+
1. Improve income and livelihood options for farming and pastoral
|
450 |
+
communities through increased productivity and competitiveness
|
451 |
+
2. Modernize agriculture and ensure national food and nutrition security
|
452 |
+
3. Raise export of agricultural output and substitute imports
|
453 |
+
4. Make agriculture a viable and profitable enterprise through value addition
|
454 |
+
5. Create rural employment opportunities
|
455 |
+
6. Enhance livestock health access and quality
|
456 |
+
7. Preserve animal genetic resources and increase pastoral research
|
457 |
+
8. Improve the development of animal feed and access to markets
|
458 |
+
9. Develop livestock specific extension package for each livestock type
|
459 |
+
Focus Areas
|
460 |
+
7.1. Productive sector
|
461 |
+
Manufacturing Industry
|
462 |
+
Objectives
|
463 |
+
1. Production of quality and competitive food, textile, housing and
|
464 |
+
pharmaceutical products for export and domestic markets
|
465 |
+
2. Production and productivity of existing manufacturing industries
|
466 |
+
3. Utilization of locally available inputs
|
467 |
+
4. Value chains, linkages and interdependencies
|
468 |
+
5. Linkages between large scale metallurgical and engineering,
|
469 |
+
chemical and pharmaceutical industries with other industries
|
470 |
+
6. Job creation, cluster approaches and expanding small and medium
|
471 |
+
scale manufacturing
|
472 |
+
7. Private sector participation and partnership
|
473 |
+
1. Establish basis for domestic industrialization
|
474 |
+
2. Value addition through enhanced inter-sectoral
|
475 |
+
linkages
|
476 |
+
3. Enhance productivity through private sector
|
477 |
+
leadership and supportive role of the
|
478 |
+
government
|
479 |
+
? Create job opportunities for the youth leaving
|
480 |
+
agriculture and concentrating in urban areas
|
481 |
+
? Make exportable commodities internationally
|
482 |
+
competitive
|
483 |
+
? Ensure structural change
|
484 |
+
Focus areas
|
485 |
+
7.1. Productive sectors
|
486 |
+
Mining
|
487 |
+
Objectives
|
488 |
+
� Foreign exchange earning and
|
489 |
+
domestic revenues
|
490 |
+
� Increased investment in mining
|
491 |
+
� Participation of manufacturing
|
492 |
+
industries that add value
|
493 |
+
� Job creation
|
494 |
+
� Add value for improved contribution of the subsector
|
495 |
+
� Increase inter-sectoral linkages to raise raw material inputs to other
|
496 |
+
sectors
|
497 |
+
� Make mining a competent subsector and induce structural change
|
498 |
+
� Increase human resource and technological capabilities through
|
499 |
+
research and trainings
|
500 |
+
� Raise foreign exchange revenue from mining through increased
|
501 |
+
exploration and production
|
502 |
+
� Improve traditional mining production and marketing systems
|
503 |
+
• Improve the country's geological information
|
504 |
+
Focus areas
|
505 |
+
7.2. Service sector
|
506 |
+
Tourism
|
507 |
+
Objectives
|
508 |
+
� Identification and developing destinations
|
509 |
+
� Infrastructure
|
510 |
+
� Competitiveness
|
511 |
+
?improve existing destinations
|
512 |
+
?develop new destinations
|
513 |
+
? diversify service and raise quality
|
514 |
+
� Market linkages, branding, and promotion
|
515 |
+
� Technology, research and development
|
516 |
+
� Preservation, maintenance and proper
|
517 |
+
utilization of heritage resources
|
518 |
+
� Expand job opportunities
|
519 |
+
� Raise incomes
|
520 |
+
� Build information management
|
521 |
+
systems
|
522 |
+
� Increase implementation capacity
|
523 |
+
Focus areas
|
524 |
+
7.3. Enabling sectors
|
525 |
+
Urban development
|
526 |
+
Objectives
|
527 |
+
? Prioritize productive sectors in job creation and enterprise
|
528 |
+
development plans
|
529 |
+
? Rapid development and equity goals in land provision system
|
530 |
+
? Participation of indigenous people in land redevelopment and
|
531 |
+
expansion
|
532 |
+
? Urban land registration and cadaster system, modern
|
533 |
+
property valuation
|
534 |
+
? Greenery and public spaces as well as waste disposal and
|
535 |
+
management in urban planning and implementation
|
536 |
+
? Housing development and financing options to reduce
|
537 |
+
housing shortages
|
538 |
+
? Integrated infrastructure and services provision
|
539 |
+
? Role of private sector in infrastructure development and
|
540 |
+
service provision
|
541 |
+
� Expand micro and small-scale
|
542 |
+
enterprises to reduce urban
|
543 |
+
unemployment
|
544 |
+
� Develop and avail urban land based on
|
545 |
+
demand, equity and cost effectiveness
|
546 |
+
� Make quality housing accessible both in
|
547 |
+
rural and urban areas
|
548 |
+
� Develop quality and integrated
|
549 |
+
infrastructure as well as service
|
550 |
+
provision in towns
|
551 |
+
� Improve financial management and
|
552 |
+
resource utilization in urban areas
|
553 |
+
Focus areas
|
554 |
+
7.3. Enabling sectors
|
555 |
+
Innovation and Technology
|
556 |
+
Objectives
|
557 |
+
? Access to innovation and
|
558 |
+
technological information
|
559 |
+
? Developing a digital economy
|
560 |
+
? Productivity enhancement and
|
561 |
+
competitiveness
|
562 |
+
? Build a digital economy
|
563 |
+
? Develop national scientific research and technological
|
564 |
+
capabilities
|
565 |
+
? Support problem solving research and development of
|
566 |
+
technologies necessary for raising production,
|
567 |
+
productivity and service provision
|
568 |
+
? Create jobs and capital that are based on technology
|
569 |
+
? Develop technological and data security protection
|
570 |
+
systems
|
571 |
+
Focus areas
|
572 |
+
7.3. Enabling sectors
|
573 |
+
Sustainable finance
|
574 |
+
Objectives
|
575 |
+
� Access to modern finance and saving culture in rural
|
576 |
+
areas
|
577 |
+
� Support to the private sector and corporations to
|
578 |
+
reinvest profits in productive sectors
|
579 |
+
� Role of private financial institutions in manufacturing
|
580 |
+
and agriculture
|
581 |
+
� Digital revenue collection system
|
582 |
+
� Tax equity (contraband, tax evasion, and bringing the
|
583 |
+
underground economy to the tax system)
|
584 |
+
� Domestic and foreign strategic partnerships
|
585 |
+
� Transform financing from short term to long-term,
|
586 |
+
sustainable and quality sources
|
587 |
+
� Ensure financing quality based on sectoral prioritization
|
588 |
+
and reduction of wastage
|
589 |
+
� Increase the number of domestic saving institutions both
|
590 |
+
in rural and urban areas
|
591 |
+
� Support domestic finance with foreign exchange capacity
|
592 |
+
and foreign direct investment
|
593 |
+
� Modernize domestic revenue collection system
|
594 |
+
� Raise voluntary tax payment attitude
|
595 |
+
� Bring the informal sector to the formal tax system
|
596 |
+
Focus areas
|
597 |
+
7.3. Enabling sectors
|
598 |
+
Transport
|
599 |
+
Objectives
|
600 |
+
� Access to infrastructure
|
601 |
+
� Implementation capacity
|
602 |
+
� Participation of the private sector and the general
|
603 |
+
public
|
604 |
+
� Financing capacity
|
605 |
+
� Ensure equitable access to transport infrastructure and
|
606 |
+
services
|
607 |
+
� Improve transport safety
|
608 |
+
� Make logistics services fast and reliable
|
609 |
+
� Build transport infrastructure and service that is
|
610 |
+
resilient to climate change
|
611 |
+
Focus areas
|
612 |
+
7.3. Enabling sectors
|
613 |
+
Energy
|
614 |
+
Objectives
|
615 |
+
? Equity in access to electricity services
|
616 |
+
? Energy access and quality
|
617 |
+
? Alternative sources of energy
|
618 |
+
? Reliability of electricity infrastructure
|
619 |
+
? Investment and income in energy subsector
|
620 |
+
� Ensure equitable access to transport
|
621 |
+
infrastructure and services
|
622 |
+
� Improve transport safety
|
623 |
+
� Make logistics services fast and reliable
|
624 |
+
� Build transport infrastructure and service that is
|
625 |
+
resilient to climate change
|
626 |
+
Focus areas
|
627 |
+
7.3. Enabling sectors
|
628 |
+
Irrigation
|
629 |
+
Objectives
|
630 |
+
? Medium and large scale irrigation infrastructure
|
631 |
+
? Job creation
|
632 |
+
? Share of government expenditure and alternative
|
633 |
+
financing options
|
634 |
+
? Institutional capacity and human resource
|
635 |
+
development
|
636 |
+
? Improve agricultural output and productivity
|
637 |
+
? Reduce government spending and enhance
|
638 |
+
institutional capacity and human resources
|
639 |
+
development
|
640 |
+
? Ensure the inclusion of all genders and
|
641 |
+
disabled citizens
|
642 |
+
? Develop alternative financing options for
|
643 |
+
irrigation development
|
644 |
+
Focus areas
|
645 |
+
7.3. Enabling sectors
|
646 |
+
Human capital development
|
647 |
+
Objectives
|
648 |
+
� Make education and training inclusive and equitable by
|
649 |
+
harmonizing the system with ability, need and capacity
|
650 |
+
� Develop capacity of educational institutions (teacher capacity,
|
651 |
+
inputs and technology)
|
652 |
+
� Establish education and training quality assurance system
|
653 |
+
� Avail free and compulsory education for pre-primary to junior
|
654 |
+
secondary levels and free education at the senior secondary levels
|
655 |
+
equitably
|
656 |
+
� Ensure the relevance of education and training system and
|
657 |
+
synchronize education policy with economic and social
|
658 |
+
development needs
|
659 |
+
� Make the education and training policy compatible with the
|
660 |
+
nation's contemporary capacities as well as global and regional
|
661 |
+
market opportunities
|
662 |
+
� Enhance commitment, capability and responsibility of citizens
|
663 |
+
? Ensure equitable and quality health services
|
664 |
+
? Raise average life expectancy
|
665 |
+
? Achieve universal health coverage through
|
666 |
+
proactive and prevention health system
|
667 |
+
? Curtail preventable maternal and child deaths
|
668 |
+
? Reduce incidences of contagious and noncontagious
|
669 |
+
related diseases and deaths
|
670 |
+
? Build capacity for health tourism through
|
671 |
+
increased treatment capabilities
|
672 |
+
? Create a healthy society that is free from
|
673 |
+
addictions and use technology for supporting
|
674 |
+
knowledge led economic development
|
675 |
+
Focus areas
|
676 |
+
8 Nationally, regionally and locally balanced and competitive development
|
677 |
+
1. Lack of synchronization of investment with
|
678 |
+
resource potentials and development needs
|
679 |
+
2. Poor alignment of federal, regional and
|
680 |
+
district level investment plans with the
|
681 |
+
national development goals and envisioned
|
682 |
+
settlement patterns
|
683 |
+
3. Poor regional coordination due to low
|
684 |
+
consideration for trans-regional and
|
685 |
+
spatial issues in development plans of
|
686 |
+
regional states
|
687 |
+
4. Inter-regional and intra-regional
|
688 |
+
disparities in infrastructural development
|
689 |
+
and access to services
|
690 |
+
Challenges
|
691 |
+
8. Nationally, regionally and locally balanced and competitive development
|
692 |
+
1. Ensure that the investment flow and
|
693 |
+
infrastructural development plans fairly go hand in
|
694 |
+
hand with resource potential and development
|
695 |
+
needs
|
696 |
+
?Developing underutilized natural resources
|
697 |
+
?Equitable distribution and access to
|
698 |
+
infrastructure
|
699 |
+
?Sustainable environmental protection
|
700 |
+
2. Ensure the inclusion of pastoral and agro-pastoral
|
701 |
+
areas in the development
|
702 |
+
?Focused infrastructural development in pastoral
|
703 |
+
areas such as education and health sector input
|
704 |
+
provision as well as governance
|
705 |
+
?Market linkages with other areas and the central
|
706 |
+
markets
|
707 |
+
?Improve rural finance (credit and insurance) to
|
708 |
+
encourage fattening, milk processing, leather
|
709 |
+
production and irrigation agriculture
|
710 |
+
Focus areas
|
711 |
+
9. Monitoring and Evaluation
|
712 |
+
10 Years Perspective
|
713 |
+
Plan KPIs
|
714 |
+
Federal Implementing
|
715 |
+
Institutions
|
716 |
+
Planning and
|
717 |
+
Development Commission
|
718 |
+
Generate Data (Census,
|
719 |
+
Sample and administrative
|
720 |
+
data)
|
721 |
+
Annual Reports
|
722 |
+
Dialogue forums
|
723 |
+
(Civic Organizations, professional
|
724 |
+
associations, development partners,
|
725 |
+
intellectuals)
|
726 |
+
Central Statistical Agency
|
727 |
+
Database
|
728 |
+
National
|
729 |
+
Information Portal
|
730 |
+
National Statistics
|
731 |
+
Development Strategic
|
732 |
+
plan
|
733 |
+
Evaluation Reports
|
734 |
+
Prime Minister's Office
|
735 |
+
House of People's
|
736 |
+
Representatives
|
737 |
+
Thank you!
|
sample/South Africa_s Low Emission Development Strategy.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
style.css
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
.row-widget.stTextInput > div:first-of-type {
|
3 |
+
background: #fff;
|
4 |
+
display: flex;
|
5 |
+
border: 1px solid #dfe1e5;
|
6 |
+
box-shadow: none;
|
7 |
+
border-radius: 24px;
|
8 |
+
height: 50px;
|
9 |
+
width: auto;
|
10 |
+
margin: 10px auto 30px;
|
11 |
+
}
|
12 |
+
|
13 |
+
/* Raise the text-input wrapper with a soft shadow on hover and on focus.
   NOTE(review): a plain wrapper div normally does not take focus itself
   (focus lands on the inner input), so :focus-within is added to make the
   focus state actually show; the original :focus selector is kept for
   backward compatibility. Confirm against the rendered Streamlit markup. */
.row-widget.stTextInput > div:first-of-type:hover,
.row-widget.stTextInput > div:first-of-type:focus,
.row-widget.stTextInput > div:first-of-type:focus-within {
  box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
}
|
17 |
+
|
18 |
+
.row-widget.stTextInput .st-bq {
|
19 |
+
background-color: #fff;
|
20 |
+
}
|
21 |
+
|
22 |
+
.row-widget.stTextInput > label {
|
23 |
+
color: #b3b3b3;
|
24 |
+
}
|
25 |
+
|
26 |
+
.row-widget.stButton > button {
|
27 |
+
border-radius: 24px;
|
28 |
+
background-color: #B6C9B1;
|
29 |
+
color: #fff;
|
30 |
+
border: none;
|
31 |
+
padding: 6px 20px;
|
32 |
+
float: right;
|
33 |
+
background-image: none;
|
34 |
+
}
|
35 |
+
|
36 |
+
.row-widget.stButton > button:hover {
|
37 |
+
box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
|
38 |
+
}
|
39 |
+
|
40 |
+
.row-widget.stButton > button:focus {
|
41 |
+
border: none;
|
42 |
+
color: #fff;
|
43 |
+
}
|
44 |
+
|
45 |
+
.footer-custom {
|
46 |
+
position: fixed;
|
47 |
+
bottom: 0;
|
48 |
+
width: 100%;
|
49 |
+
color: var(--text-color);
|
50 |
+
max-width: 698px;
|
51 |
+
font-size: 14px;
|
52 |
+
height: 50px;
|
53 |
+
padding: 10px 0;
|
54 |
+
z-index: 50;
|
55 |
+
}
|
56 |
+
|
57 |
+
.main {
|
58 |
+
padding: 20px;
|
59 |
+
}
|
60 |
+
|
61 |
+
footer {
|
62 |
+
display: none !important;
|
63 |
+
}
|
64 |
+
|
65 |
+
.footer-custom a {
|
66 |
+
color: var(--text-color);
|
67 |
+
}
|
68 |
+
|
69 |
+
#wikipedia-assistant {
|
70 |
+
font-size: 36px;
|
71 |
+
}
|
72 |
+
|
73 |
+
.generated-answer p {
|
74 |
+
font-size: 16px;
|
75 |
+
font-weight: bold;
|
76 |
+
}
|
77 |
+
|
78 |
+
.react-json-view {
|
79 |
+
margin: 40px 0 80px;
|
80 |
+
}
|
81 |
+
|
82 |
+
.tooltip {
|
83 |
+
text-align: center;
|
84 |
+
line-height: 20px;
|
85 |
+
display: table-caption;
|
86 |
+
font-size: 10px;
|
87 |
+
border-radius: 50%;
|
88 |
+
height: 20px;
|
89 |
+
width: 20px;
|
90 |
+
position: relative;
|
91 |
+
cursor: pointer;
|
92 |
+
color:#000;
|
93 |
+
}
|
94 |
+
|
95 |
+
.tooltip .tooltiptext {
|
96 |
+
visibility: hidden;
|
97 |
+
width: 280px;
|
98 |
+
text-align: center;
|
99 |
+
border-radius: 6px;
|
100 |
+
padding: 10px;
|
101 |
+
position: absolute;
|
102 |
+
z-index: 1;
|
103 |
+
top: 25px;
|
104 |
+
left: 50%;
|
105 |
+
margin-left: -140px;
|
106 |
+
font-size: 14px;
|
107 |
+
background-color: #fff;
|
108 |
+
border: 1px solid #ccc;
|
109 |
+
box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
|
110 |
+
color: #000;
|
111 |
+
}
|
112 |
+
|
113 |
+
.tooltip:hover .tooltiptext {
|
114 |
+
visibility: visible;
|
115 |
+
}
|
116 |
+
|
117 |
+
.sentence-wrapper {
|
118 |
+
border-left: 4px solid #ffc423;
|
119 |
+
padding-left: 20px;
|
120 |
+
margin-bottom: 40px;
|
121 |
+
}
|
122 |
+
|
123 |
+
#context {
|
124 |
+
padding: 2rem 0 1rem;
|
125 |
+
}
|
126 |
+
|
127 |
+
hr {
|
128 |
+
margin: 2em 0 1em;
|
129 |
+
}
|
130 |
+
|
131 |
+
.technical-details-info {
|
132 |
+
margin-bottom: 100px;
|
133 |
+
}
|
134 |
+
|
135 |
+
.loader-wrapper {
|
136 |
+
display: flex;
|
137 |
+
align-items: center;
|
138 |
+
background-color: rgba(250, 202, 43, 0.2);
|
139 |
+
padding: 15px 20px;
|
140 |
+
border-radius: 6px;
|
141 |
+
}
|
142 |
+
|
143 |
+
.loader-wrapper p {
|
144 |
+
margin-bottom: 0;
|
145 |
+
margin-left: 20px;
|
146 |
+
}
|
147 |
+
|
148 |
+
.loader {
|
149 |
+
width: 30px;
|
150 |
+
height: 30px;
|
151 |
+
border: dotted 5px #868686;
|
152 |
+
border-radius: 100%;
|
153 |
+
animation: spin 1s linear infinite;
|
154 |
+
}
|
155 |
+
|
156 |
+
.loader-note {
|
157 |
+
font-size: 14px;
|
158 |
+
color: #b3b3b3;
|
159 |
+
margin-left: 5px;
|
160 |
+
}
|
161 |
+
|
162 |
+
@keyframes spin {
|
163 |
+
0% {
|
164 |
+
transform: rotate(0deg) scale(0.8);
|
165 |
+
border-top-color: transparent;
|
166 |
+
border-right-color: transparent;
|
167 |
+
}
|
168 |
+
50% { transform: rotate(180deg) scale(1.2);
|
169 |
+
border-color: #949494;
|
170 |
+
border-top-color: transparent;
|
171 |
+
border-right-color: transparent;
|
172 |
+
}
|
173 |
+
100% { transform: rotate(360deg) scale(0.8);
|
174 |
+
border-color: #bbbbbb;
|
175 |
+
border-top-color: transparent;
|
176 |
+
border-right-color: transparent;
|
177 |
+
}
|
178 |
+
}
|
179 |
+
|
udfPreprocess/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# adding for package implementation
|
udfPreprocess/cleaning.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import string
|
4 |
+
import nltk
|
5 |
+
import spacy
|
6 |
+
import en_core_web_sm
|
7 |
+
import re
|
8 |
+
import streamlit as st
|
9 |
+
|
10 |
+
from haystack.nodes import PreProcessor
|
11 |
+
|
12 |
+
'''basic cleaning - suitable for transformer models'''
|
13 |
+
def basic(s):
    """
    Apply light cleaning to a text: strip URLs and surrounding whitespace.

    Case, punctuation, digits and line breaks are left untouched.

    :param s: string to be processed
    :return: the string without URLs and without leading/trailing whitespace
    """
    # A line that *starts* with a URL is dropped as a whole, together with
    # its trailing line break(s).
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    # Remove URLs embedded in running text. The "://" separator is required
    # so that ordinary words that merely begin with "http" (e.g. "httpd",
    # "https") are no longer deleted.
    s = re.sub(r"https?://\S+", " ", s)

    return s.strip()
|
39 |
+
|
40 |
+
|
41 |
+
def preprocessingForSDG(document):
    """
    Split haystack documents into word-based chunks and clean every chunk.

    Each input document is passed through a haystack ``PreProcessor``
    configured with ``split_by="word"`` and ``split_length=120``; every
    resulting chunk is then cleaned with ``basic``.

    :param document: iterable of haystack Document objects
        (assumed to be a list such as the one a file converter returns)
    :return: tuple ``(docs_processed, df, all_text, par_list)`` where
        ``docs_processed`` is the list of cleaned chunk documents,
        ``df`` a pandas DataFrame built from that list,
        ``all_text`` all chunk texts joined by a single space and
        ``par_list`` the list of chunk texts.
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=False,
    )

    # Accumulate the chunks of *all* input documents. Re-assigning the result
    # inside the loop would keep only the last document and leave the name
    # undefined for an empty input.
    docs_processed = []
    for doc in document:
        docs_processed.extend(preprocessor.process([doc]))

    for item in docs_processed:
        item.content = basic(item.content)

    st.write("your document has been splitted to", len(docs_processed), "paragraphs")

    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    # Read the texts from the documents themselves so that an empty result
    # (DataFrame without a "content" column) does not raise.
    par_list = [item.content for item in docs_processed]
    all_text = " ".join(par_list)

    return docs_processed, df, all_text, par_list
|
72 |
+
|
73 |
+
def preprocessing(document):
    """
    Split haystack documents into sentence-based chunks and clean every chunk.

    Each input document is passed through a haystack ``PreProcessor``
    configured with ``split_by="sentence"``, ``split_length=3`` and
    ``split_overlap=1``; every resulting chunk is then cleaned with
    ``basic``.

    :param document: iterable of haystack Document objects
        (assumed to be a list such as the one a file converter returns)
    :return: tuple ``(docs_processed, df, all_text, par_list)`` where
        ``docs_processed`` is the list of cleaned chunk documents,
        ``df`` a pandas DataFrame built from that list,
        ``all_text`` all chunk texts joined by a single space and
        ``par_list`` the list of chunk texts.
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1,
    )

    # Accumulate the chunks of *all* input documents. Re-assigning the result
    # inside the loop would keep only the last document and leave the name
    # undefined for an empty input.
    docs_processed = []
    for doc in document:
        docs_processed.extend(preprocessor.process([doc]))

    for item in docs_processed:
        item.content = basic(item.content)

    st.write("your document has been splitted to", len(docs_processed), "paragraphs")

    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    # Read the texts from the documents themselves so that an empty result
    # (DataFrame without a "content" column) does not raise.
    par_list = [item.content for item in docs_processed]
    all_text = " ".join(par_list)

    return docs_processed, df, all_text, par_list
|
104 |
+
|
105 |
+
'''processing with spacy - suitable for models such as tf-idf, word2vec'''
|
106 |
+
# Lazily created spaCy pipeline shared by all calls to spacy_clean.
_SPACY_MODEL = None


def _get_spacy_model():
    """Load the "en_core_web_sm" pipeline on first use and reuse it afterwards."""
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        _SPACY_MODEL = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
    return _SPACY_MODEL


def spacy_clean(alpha:str, use_nlp:bool = True) -> str:
    """
    Clean and tokenise a string using spaCy.

    Keeps only alphabetic, non-stopword tokens that are proper nouns, nouns,
    verbs or adjectives, and returns their lemmas in lower case.

    Parameters
    ----------
    alpha : str
        The input string. When ``use_nlp`` is False this must already be an
        iterable of spaCy-like tokens (e.g. a ``Doc``), since it is iterated
        directly.
    use_nlp : bool, default True
        Indicates whether the spaCy pipeline has to be run on ``alpha``.
        Enable this when using this function on its own; set it to False
        if used inside nlp.pipeline.

    Returns
    -------
    str
        The lower-cased lemmas of the kept tokens joined by single spaces.

    Notes
    -----
    Fails if alpha is an NA value. Performance decreases as len(alpha) gets
    large. Use together with nlp.pipeline for batch processing.
    """
    if use_nlp:
        # The model is loaded only when it is actually needed and only once,
        # instead of on every call.
        alpha = _get_spacy_model()(alpha)

    kept_pos = ('PROPN', 'NOUN', 'VERB', 'ADJ')
    beta = [
        tok.lemma_
        for tok in alpha
        if tok.is_alpha and not tok.is_stop and tok.pos_ in kept_pos
    ]

    return ' '.join(beta).lower()
|
udfPreprocess/docPreprocessing.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable, Dict, List, Optional
|
2 |
+
|
3 |
+
from pathlib import Path
|
4 |
+
import re
|
5 |
+
import logging
|
6 |
+
import string
|
7 |
+
import streamlit as st
|
8 |
+
logger = logging.getLogger(__name__)
|
9 |
+
|
10 |
+
import os
|
11 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
12 |
+
|
13 |
+
from haystack.utils import convert_files_to_docs, fetch_archive_from_http
|
14 |
+
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter
|
15 |
+
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter
|
16 |
+
from haystack.schema import Document
|
17 |
+
import pdfplumber
|
18 |
+
|
19 |
+
import pandas as pd
|
20 |
+
|
21 |
+
import tempfile
|
22 |
+
import sqlite3
|
23 |
+
|
24 |
+
|
25 |
+
|
26 |
+
def load_document(
    file_path: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Convert a docx, txt or pdf file into a haystack Document.

    The converter is chosen from the extension of ``file_name`` and the file
    name is stored as metadata under the key ``"name"``. If the haystack
    converter yields no text for a pdf, the text is extracted with
    pdfplumber instead.

    :param file_path: path of the file to read
    :param file_name: name of the file; its extension selects the converter
    :param encoding: passed through to the converter
    :param id_hash_keys: passed through to the converter and the Document
    :return: a list containing a single haystack.schema.Document
    :raises ValueError: if the extension is not .pdf, .txt or .docx
    """
    lowered_name = file_name.lower()  # accept ".PDF", ".Txt", ...
    is_pdf = lowered_name.endswith('.pdf')

    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith('.txt'):
        converter = TextConverter()
    elif lowered_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            "Unsupported file type for {!r}: expected .pdf, .txt or .docx".format(file_name)
        )

    logger.info("Converting %s", file_name)
    # PDFToTextConverter, TextConverter, and DocxToTextConverter
    # return a list containing a single Document
    document = converter.convert(
        file_path=file_path, meta=None,
        encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content

    # Check if the text is empty and apply a different pdf processor.
    # This can happen with certain pdf types. The fallback is limited to
    # pdf files because pdfplumber cannot open other formats.
    if is_pdf and not (text or "").strip():
        st.write("using pdfplumber")
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may give None for a page without text.
            pages = [page.extract_text() or "" for page in pdf.pages]
        text = ' '.join(pages)

    # Build the Document only once the final text is known instead of
    # creating it with empty content and mutating it afterwards.
    return [Document(content=text,
                     meta={"name": file_name},
                     id_hash_keys=id_hash_keys)]
|