Spaces:

Sa-m
/

manifesto-explainer

Sleeping

App Files Files Community

Sa-m committed on Dec 27, 2021

Commit

a87bc00

1 Parent(s): b3f8933

Upload app.py

Browse files

Files changed (1) hide show

app.py +198 -0

app.py ADDED Viewed

	@@ -0,0 +1,198 @@

+# -*- coding: utf-8 -*-
+"""trial _final yr proj.ipynb
+Automatically generated by Colaboratory.
+Original file is located at
+    https://colab.research.google.com/drive/1AGAk7En1Rd0RuEju4MzMxSCUVnGq73Es
+"""
+"""# MANIFESTO ANALYSIS
+## IMPORTING LIBRARIES
+"""
+# Commented out IPython magic to ensure Python compatibility.
+# %%capture
+# !pip install tika
+# !pip install clean-text
+# !pip install gradio
+# Commented out IPython magic to ensure Python compatibility.
+import io
+import random
+import matplotlib.pyplot as plt
+import nltk
+from nltk.tokenize import word_tokenize,sent_tokenize
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
+from nltk.stem import WordNetLemmatizer
+from tika import parser
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.probability import FreqDist
+from cleantext import clean
+import nltk.corpus
+from nltk.text import Text
+from io import StringIO
+import sys
+import re
+from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
+from textblob import TextBlob
+from PIL import Image
+import gradio as gr
+from zipfile import ZipFile
# Download the NLTK resources this script asks for, in a fixed order.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
  nltk.download(nltk_resource)
+"""## PARSING FILES"""
def Parsing(parsed_text):
  '''
  Extract the text of an uploaded document and return it cleaned.

  parsed_text must expose the path of the file as its ``name`` attribute.
  That path is handed to tika's parser.from_file(), and the 'content'
  entry of the parse result is passed through cleantext's clean().
  '''
  document_path = parsed_text.name
  extracted = parser.from_file(document_path)
  return clean(extracted['content'])
# English stop words plus a few extra terms that are irrelevant for the
# manifesto analysis.
stop_words = set(stopwords.words('english'))
# set.update() treats every positional argument as an iterable of elements,
# so the extra words are passed as ONE list. Passing the bare strings would
# add their individual characters ('a', 's', 'k', ...) instead of the words.
stop_words.update([
  'ask', 'much', 'thank', 'etc.', 'e', 'We', 'In', 'ed', 'pa', 'This',
  'also', 'A', 'fu', 'To', '5', 'ing', 'er', '2',
])
+"""## PREPROCESSING"""
def clean_text(text):
  '''
  Normalise whitespace in text and drop wordcloud STOPWORDS.

  Steps, in order:
    1. discard every non-ASCII character;
    2. turn newlines and tabs into spaces, then turn each "/ " into a space;
    3. trim and collapse runs of spaces;
    4. split on whitespace, keep the words that are not in STOPWORDS
       (exact, case-sensitive match) and join them with single spaces.
  '''
  ascii_only = text.encode("ascii", errors="ignore").decode("ascii")
  # Newlines and tabs must become spaces before the "/ " rule runs, so that
  # a slash followed by a line break is also caught by it.
  flattened = re.sub(r"[\n\t]", " ", ascii_only)
  flattened = re.sub(r"/ ", " ", flattened)
  collapsed = re.sub(" +", " ", flattened.strip(" ")).strip()
  kept_words = [token for token in collapsed.split() if token not in STOPWORDS]
  return ' '.join(kept_words)
+# text_Party=clean_text(raw_party)
def Preprocess(textParty):
  '''
  Strip special characters and English stop words from textParty.

  Every run of characters other than A-Z, a-z and 0-9 is replaced by a
  single space. Each whole-word occurrence of an NLTK English stop word is
  then removed together with the whitespace that follows it.
  '''
  alphanumeric_only = re.sub('[^A-Za-z0-9]+', ' ', textParty)
  # One alternation of all stop words, anchored on word boundaries so that
  # only complete words are matched.
  alternation = r'|'.join(stopwords.words('english'))
  stopword_pattern = re.compile(r'\b(' + alternation + r')\b\s*')
  return stopword_pattern.sub('', alphanumeric_only)
# Using concordance, you can see each time a word is used, along with its
# immediate context. It can give you a peek into how a word is being used
# at the sentence level and what words are used with it.
def concordance(text_Party,strng):
  '''
  Return the concordance report for strng in text_Party as one string.

  text_Party : text to search; it is tokenized with word_tokenize
  strng      : word to look up

  The report is produced by nltk's Text.concordance(), limited to 10 lines
  of 82 characters. That method prints its output, so standard output is
  captured into an in-memory buffer for the duration of the call.
  '''
  from contextlib import redirect_stdout

  tokens = word_tokenize(text_Party)
  captured = StringIO()
  # The context manager puts sys.stdout back even when concordance() raises,
  # so a failed lookup cannot leave the process printing into the buffer.
  with redirect_stdout(captured):
    Text(tokens).concordance(strng, lines=10, width=82)
  return captured.getvalue()
def normalize(d, target=1.0):
  '''
  Scale the values of d so that they add up to target.

  d      : mapping of keys to numeric values
  target : desired sum of the scaled values (default 1.0)

  Returns a new dict with the same keys; d itself is not modified.
  When the values sum to zero (which includes an empty d) no scaling
  factor exists, so every key is mapped to 0.0 rather than raising
  ZeroDivisionError.
  '''
  total = sum(d.values())
  if total == 0:
    return {key: 0.0 for key in d}
  factor = target / total
  return {key: value * factor for key, value in d.items()}
def fDistance(text2Party):
  '''
  Relative weights of the ten most common tokens in text2Party.

  The text is tokenized with word_tokenize and counted with FreqDist. The
  ten most common tokens are kept, and their counts are rescaled by
  normalize() before the {token: weight} dict is returned.
  '''
  tokens = word_tokenize(text2Party)
  top_counts = dict(FreqDist(tokens).most_common(10))
  return normalize(top_counts)
def fDistancePlot(text2Party,plotN=20):
  '''
  Plot the most frequent tokens of text2Party.

  text2Party : text to analyse; it is tokenized with word_tokenize
  plotN      : how many of the most frequent tokens to plot (default 20)

  Returns the result of FreqDist.plot().
  '''
  tokens = word_tokenize(text2Party)
  # plotN is forwarded to plot() so callers control how many tokens appear;
  # the default keeps the 20-token plot.
  return FreqDist(tokens).plot(plotN)
## UI INTERFACE
def analysis(Manifesto,Search):
  '''
  Run the manifesto pipeline behind the Gradio interface.

  Manifesto : uploaded file object, passed to Parsing()
  Search    : word to look up in the manifesto text

  Returns a 2-tuple:
    - the weights of the most frequent tokens, from fDistance();
    - the concordance report for Search, with every occurrence of Search
      underlined.
  '''
  raw_party = Parsing(Manifesto)
  text_Party = Preprocess(clean_text(raw_party))
  fdist_Party = fDistance(text_Party)
  searchRes = concordance(text_Party, Search)
  if Search:
    # U+0332 (COMBINING LOW LINE) attaches to the character before it, so it
    # has to follow every character of the word, the last one included.
    underlined = "".join(char + "\u0332" for char in Search)
    searchRes = searchRes.replace(Search, underlined)
  return fdist_Party, searchRes
# Input widgets: the word to search for and the uploaded manifesto file.
Search_txt = gr.inputs.Textbox()
filePdf = gr.inputs.File()

# Output widgets: the concordance text and the token weights.
text = gr.outputs.Textbox(label='SEARCHED OUTPUT')
mfw = gr.outputs.Label(label="Most Relevant topics in manifesto")

# The input order matches analysis(Manifesto, Search); the output order
# matches the tuple it returns.
demo = gr.Interface(
  fn=analysis,
  inputs=[filePdf, Search_txt],
  outputs=[mfw, text],
  title='Manifesto Analysis',
)
demo.launch(debug=False, share=True)