Spaces:
Runtime error
Runtime error
File size: 7,020 Bytes
c0ae847 af90ec4 5f89cc0 cc0d652 2d2d28b 50d3158 7a228d7 d90f05d 4b1dcc8 5f89cc0 46193fd 5f89cc0 2d2d28b 845351c 5f89cc0 20d6d68 03ddbfd 20d6d68 99ba17a 20d6d68 1a0e07b 20d6d68 03ddbfd 20d6d68 5f89cc0 d90f05d 1918f01 20e79af 2115e8c 20d6d68 75d2ced 20d6d68 2d2d28b 8e106db 2d2d28b 20d6d68 71f1303 2a61e91 03ddbfd 20d6d68 b87bcef c7464b2 2d2d28b 20d6d68 2d2d28b b87bcef 5f89cc0 b87bcef 94d59bf f497f7f 2a61e91 20d6d68 94d59bf 5f89cc0 fb5ed70 1918f01 da1f55e 5f89cc0 da1f55e 7a228d7 ccd2173 97f7d3e 5f89cc0 8cff1a5 b3bc472 5f89cc0 eea15da b3bc472 d7485e8 d90f05d 432c28d d7485e8 c0ade28 13e8889 d7485e8 2115e8c 20d6d68 0f4d5d5 cce90fc 97f7d3e 6f4ddaf af8a888 6f4ddaf 0eb61e5 2115e8c 1a0e07b 2115e8c 6349b5a 6a4b9ce 6349b5a 6a4b9ce 6349b5a 6a4b9ce 6349b5a 2115e8c 20d6d68 b87bcef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import streamlit as st
import re
import PyPDF2
import matplotlib.pyplot as plt
import io
from wordcloud import WordCloud
from PIL import Image
from rouge import Rouge
from datasets import load_dataset
from extractive_summarization import summarize_with_textrank, summarize_with_lsa
from abstractive_summarization import summarize_with_bart_cnn, summarize_with_bart_ft, summarize_with_led, summarize_with_t5
#from keyword_extraction import extract_keywords
from keyphrase_extraction import extract_sentences_with_obligations
from hybrid_summarization import summarize_hybrid
#-------------------------------------------------------------------#
# Fetch the ToS-Summaries dataset from the Hugging Face hub
dataset = load_dataset("EE21/ToS-Summaries")
# Build one display label per row of the training split ("Document 0", "Document 1", ...)
document_count = len(dataset['train'])
tos_titles = [f"Document {i}" for i in range(document_count)]
# Switch the Streamlit page to its wide layout
st.set_page_config(layout="wide")
def load_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Anything ``PyPDF2.PdfReader`` accepts (here, the object returned
            by the Streamlit file uploader).

    Returns:
        A single string with the extracted text of all pages. Pages for which
        ``extract_text()`` returns nothing contribute an empty string, so the
        result is ``""`` when no page yields any text.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Iterate the pages directly and join once instead of indexing by page
    # number and growing the string with repeated ``+=``.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def _resolve_input_text(uploaded_file, user_input, tos_selection_index):
    """Choose the text to summarize from the three input widgets.

    Sources are tried in this order: uploaded PDF, typed text, selected
    dataset document. Supplying both a PDF and typed text is rejected.

    Args:
        uploaded_file: Value returned by the PDF file uploader (falsy if none).
        user_input: Contents of the text area (empty string if none).
        tos_selection_index: Index into ``dataset['train']`` or ``None``.

    Returns:
        ``(text, reference_summary)`` on success, where ``reference_summary``
        is the dataset row's ``'summary'`` field when the text came from the
        dataset and the row has one, otherwise ``None``. Returns ``None``
        (after showing a warning) when no usable input was provided.
    """
    # The ambiguity is only between the PDF and the text area. The dropdown
    # always holds a value, and its default index 0 is falsy, so it must not
    # take part in this check.
    if uploaded_file and user_input:
        st.warning("Please provide either text input or a PDF file, not both.")
        return None

    if uploaded_file:
        pdf_text = load_pdf(uploaded_file)
        if not pdf_text.strip():
            # Nothing extractable: do not hand an empty document to the models.
            st.warning("No text could be extracted from the uploaded PDF.")
            return None
        st.write("PDF uploaded successfully.")
        return pdf_text, None

    if user_input:
        return user_input, None

    if tos_selection_index is not None:
        document = dataset['train'][tos_selection_index]
        reference_summary = document['summary'] if 'summary' in document else None
        return document['plain_text'], reference_summary

    st.warning("Please upload a PDF, enter some text, or select a document to summarize.")
    return None


def _render_word_cloud(summary):
    """Display a word cloud image of the summary, or a warning if it has no words."""
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=20).generate(summary)
    except ValueError:
        # WordCloud raises ValueError when the text yields no words to plot.
        st.warning("The summary does not contain enough words to draw a word cloud.")
        return
    # Streamlit is handed PNG bytes: render the cloud to a PIL image and
    # serialize it into an in-memory buffer.
    buf = io.BytesIO()
    wordcloud.to_image().save(buf, format='PNG')
    st.image(buf.getvalue(), caption='Word Cloud of Summary', use_column_width=True)


def _render_rouge_scores(summary, reference_summary):
    """Display ROUGE-1/2/L F-scores of the summary against its reference summary."""
    try:
        scores = Rouge().get_scores(summary, reference_summary)
    except ValueError:
        # The rouge package raises ValueError for an empty hypothesis/reference.
        st.warning("ROUGE scores could not be computed for this summary.")
        return
    badge_style = "text-align: center; color: black; border: 1px solid #cccccc; padding: 5px; border-radius: 4px;"
    metrics = (("ROUGE-1", "rouge-1"), ("ROUGE-2", "rouge-2"), ("ROUGE-L", "rouge-l"))
    # One column per metric, each showing the F-score as a bordered badge.
    for column, (label, key) in zip(st.columns(3), metrics):
        with column:
            st.markdown(f"<p style='{badge_style}'>{label}: {scores[0][key]['f']:.4f}</p>", unsafe_allow_html=True)


def main():
    """Render the QuickToS page: summarizer choice, input widgets, and results.

    The layout has three columns: the summarizer radio buttons, the input
    widgets with the "Summarize" button, and the output panel. The summary is
    kept in ``st.session_state.summary`` together with
    ``st.session_state.reference_summary`` (the dataset reference belonging to
    the summarized document, or ``None``), so that ROUGE is always computed
    against the document the displayed summary was actually generated from.
    """
    st.title("QuickToS - Terms of Service Summarizer")

    # Radio label -> summarization function. Insertion order is the display
    # order of the radio buttons.
    summarizers = {
        "Hybrid (RAKE + BART Fine-tuned)": summarize_hybrid,
        "Abstractive (LongT5)": summarize_with_t5,
        "Abstractive (LED)": summarize_with_led,
        "Abstractive (BART Fine-tuned)": summarize_with_bart_ft,
        "Abstractive (BART-large-CNN)": summarize_with_bart_cnn,
        "Extractive (TextRank)": summarize_with_textrank,
        "Extractive (Latent Semantic Analysis)": summarize_with_lsa,
        "Keyphrase Extraction (RAKE)": extract_sentences_with_obligations,
    }

    # Layout: 3 columns
    choice_col, input_col, output_col = st.columns([1, 3, 2], gap="large")

    # Left column: radio buttons for summarizer choice
    with choice_col:
        radio_selection = st.radio("Choose type of summarizer:", list(summarizers))

    # Middle column: text input, file uploader and dataset document dropdown
    with input_col:
        user_input = st.text_area("Enter a text")
        uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
        tos_selection_index = st.selectbox("Select a Terms of Service Document", range(len(tos_titles)), format_func=lambda x: tos_titles[x])

        if st.button("Summarize"):
            resolved = _resolve_input_text(uploaded_file, user_input, tos_selection_index)
            if resolved is None:
                return
            file_content, reference_summary = resolved
            st.session_state.summary = summarizers[radio_selection](file_content)
            # Remember which reference (if any) belongs to this summary so
            # later reruns do not score it against a different document.
            st.session_state.reference_summary = reference_summary

    # Right column: summary, word cloud and (for dataset documents) ROUGE scores
    with output_col:
        st.write("Summary:")
        if 'summary' in st.session_state:
            summary = st.session_state.summary
            st.write(summary)
            _render_word_cloud(summary)
            if 'reference_summary' in st.session_state and st.session_state.reference_summary:
                _render_rouge_scores(summary, st.session_state.reference_summary)
# Start the app when this file is executed as a script
if __name__ == "__main__":
    main()
|