vibey committed on
Commit
77f16bc
1 Parent(s): 1d9b1cf

Upload function.py

Browse files
Files changed (1) hide show
  1. function.py +106 -0
function.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Pkgs
2
+ import streamlit as st
3
+ from transformers import pipeline
4
+ from PyPDF2 import PdfFileReader
5
+ import docx2txt
6
+ import base64
7
+ import re
8
+ import sqlite3
9
+ import time
10
+ from io import StringIO
11
+ import warnings
12
+ warnings.filterwarnings("ignore")  # no category/module filter given, so this suppresses every warning process-wide
13
+
14
+ time_str = time.strftime("%Y%m%d-%H%M%S")  # local-time stamp (YYYYMMDD-HHMMSS) evaluated once, when this module is executed
15
+ # Loader functions for the summarization pipelines (Hugging Face models)
16
@st.cache(allow_output_mutation=True)
def bart():
    """Build the BART summarization pipeline.

    Uses the ``facebook/bart-large-cnn`` checkpoint through the
    ``transformers`` pipeline API. The function is wrapped in ``st.cache``
    with ``allow_output_mutation=True``.

    Returns:
        The object returned by ``pipeline('summarization', ...)``.
    """
    return pipeline('summarization', model='facebook/bart-large-cnn')
21
+
22
@st.cache(allow_output_mutation=True)
def t5():
    """Build the T5 summarization pipeline.

    Uses the ``t5-base`` checkpoint for both model and tokenizer through the
    ``transformers`` pipeline API.

    The function is cached the same way as ``bart`` so that the pipeline is
    constructed once instead of on every call; without the decorator each
    call re-created the whole pipeline.

    Returns:
        The object returned by ``pipeline("summarization", ...)``.
    """
    return pipeline("summarization", model="t5-base", tokenizer="t5-base")
26
+
27
+ # def pegasus():
28
+ # ''' Loading pegasus model using pipeline api '''
29
+ # summarizer = pipeline('summarization',model='google/pegasus-xsum')
30
+ # return summarizer
31
+
32
def preprocess_plain_text(x):
    """Clean raw text before it is summarized.

    Steps, in order:

    1. Drop every non-ASCII character.
    2. Remove ``http://`` / ``https://`` URLs.
    3. Remove ``@mentions`` and ``#hashtags`` (the marker plus everything up
       to the next whitespace).
    4. Collapse each run of characters other than letters, digits and
       ``.,!?`` into one space. Whitespace is part of those runs, so this
       step also normalizes repeated spaces, tabs and newlines.

    Args:
        x: Text to clean.

    Returns:
        The cleaned text. Leading and trailing spaces are not stripped.
    """
    # Non-ASCII characters are discarded, not transliterated.
    x = x.encode("ascii", "ignore").decode()

    # Require "://" after the scheme. The previous pattern `https*\S+`
    # matched any token that merely started with "http" (e.g. "httpd") and
    # deleted ordinary words.
    x = re.sub(r"https?://\S+", " ", x)

    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags

    # Special characters except .,!? -- whitespace is in the negated class,
    # so a separate whitespace-collapsing pass is unnecessary.
    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)

    return x
42
+
43
def extract_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: Whatever ``PyPDF2.PdfFileReader`` accepts as its source
            (presumably the uploaded file object -- confirm against caller).

    Returns:
        The page texts concatenated in page order, with no separator added
        between pages.
    """
    reader = PdfFileReader(file)
    page_texts = [
        reader.getPage(page_no).extractText()
        for page_no in range(reader.numPages)
    ]
    return "".join(page_texts)
55
+
56
+
57
def extract_text_from_file(file):
    """Extract text from an uploaded file based on its MIME type.

    Supported types:

    * ``text/plain`` -- the bytes from ``file.getvalue()`` decoded as UTF-8.
    * ``application/pdf`` -- delegated to ``extract_pdf``.
    * ``application/vnd.openxmlformats-officedocument.wordprocessingml.document``
      (docx) -- delegated to ``docx2txt.process``.

    Args:
        file: Uploaded file object exposing a ``type`` attribute holding the
            MIME type; for plain text it must also provide ``getvalue()``
            returning bytes.

    Returns:
        The extracted text.

    Raises:
        ValueError: If ``file.type`` is not one of the supported MIME types.
            Previously this case fell through to ``return file_text`` with
            the variable unassigned and raised ``UnboundLocalError``.
        UnicodeDecodeError: If a plain-text file is not valid UTF-8.
    """
    mime_type = file.type

    if mime_type == "text/plain":
        # Decoding the bytes directly yields the same string as wrapping it
        # in StringIO and reading it back.
        return file.getvalue().decode("utf-8")

    if mime_type == "application/pdf":
        return extract_pdf(file)

    if (
        mime_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        return docx2txt.process(file)

    raise ValueError(f"Unsupported file type: {mime_type!r}")
81
+
82
def summary_downloader(raw_text):
    """Render a Streamlit link that downloads ``raw_text`` as a text file.

    The text is embedded in the page as a base64 ``data:`` URI, so no file
    is written on the server.

    Args:
        raw_text: The text to offer for download.

    Returns:
        None. A heading and an HTML anchor are written to the page with
        ``st.markdown``.
    """
    b64 = base64.b64encode(raw_text.encode()).decode()

    # Stamp the filename at call time. The module-level `time_str` is
    # evaluated only once, when the module is executed, so every download
    # produced later would otherwise carry the same stale timestamp.
    new_filename = "new_text_file_{}_.txt".format(time.strftime("%Y%m%d-%H%M%S"))

    st.markdown("#### Download Summary as a File ###")

    # `text/plain` replaces `file/txt`, which is not a real MIME type. The
    # charset matches the UTF-8 default used by `raw_text.encode()`.
    href = (
        f'<a href="data:text/plain;charset=utf-8;base64,{b64}" '
        f'download="{new_filename}">Click to Download!!</a>'
    )
    st.markdown(href, unsafe_allow_html=True)
89
+
90
+
91
+ # Storage in A Database
92
+ conn = sqlite3.connect('summarizer_database.db',check_same_thread=False)  # module-level connection; check_same_thread=False disables sqlite3's same-thread check
93
+ c = conn.cursor()  # single cursor reused by create_table, add_data and view_all_data
94
+ # SQL helper functions
95
def create_table():
    """Create the ``TextTable`` table if it does not exist yet.

    Columns: ``text_to_summarize`` (TEXT), ``summarized_text`` (TEXT) and
    ``postdate`` (DATE). Runs on the module-level cursor ``c``.
    """
    ddl = 'CREATE TABLE IF NOT EXISTS TextTable(text_to_summarize TEXT,summarized_text TEXT,postdate DATE)'
    c.execute(ddl)
97
+
98
+
99
def add_data(text_to_summarize,summarized_text,postdate):
    """Insert one row into ``TextTable`` and commit it.

    Args:
        text_to_summarize: Value for the ``text_to_summarize`` column.
        summarized_text: Value for the ``summarized_text`` column.
        postdate: Value for the ``postdate`` column.

    The values are bound through ``?`` placeholders on the module-level
    cursor ``c``; the module-level connection ``conn`` is then committed.
    """
    insert_sql = 'INSERT INTO TextTable(text_to_summarize,summarized_text,postdate) VALUES (?,?,?)'
    row = (text_to_summarize, summarized_text, postdate)
    c.execute(insert_sql, row)
    conn.commit()
102
+
103
def view_all_data():
    """Return every row stored in ``TextTable``.

    Returns:
        The list produced by ``fetchall()`` on the module-level cursor ``c``
        after selecting all columns from ``TextTable``.
    """
    c.execute("SELECT * FROM TextTable")
    return c.fetchall()