# QuickToS: a Streamlit app that summarizes Terms of Service documents.
#
# The text to summarize comes from an uploaded PDF, from pasted text, or from
# a document of the "EE21/ToS-Summaries" dataset. When a dataset document is
# summarized, ROUGE scores against its reference summary are displayed too.
#
# NOTE: this header is a comment rather than a module docstring so that it can
# never be rendered on the page by Streamlit's "magic" output of bare literals.
import io
import re

import matplotlib.pyplot as plt
import PyPDF2
import streamlit as st
from datasets import load_dataset
from PIL import Image
from rouge import Rouge
from wordcloud import WordCloud

from abstractive_summarization import (
    summarize_with_bart_cnn,
    summarize_with_bart_ft,
    summarize_with_led,
    summarize_with_t5,
)
from extractive_summarization import summarize_with_lsa, summarize_with_textrank
from hybrid_summarization import summarize_hybrid
#from keyword_extraction import extract_keywords
from keyphrase_extraction import extract_sentences_with_obligations

#-------------------------------------------------------------------#

# Load in ToS-Summaries dataset
dataset = load_dataset("EE21/ToS-Summaries")

# Extract titles or identifiers for the ToS
tos_titles = [f"Document {i}" for i in range(len(dataset['train']))]

# Set page to wide mode
st.set_page_config(layout="wide")

# Radio-button label -> summarizer callable. Insertion order is the order in
# which the options are shown in the UI.
# (Keyword extraction via `extract_keywords` is currently disabled.)
_SUMMARIZERS = {
    "Hybrid (RAKE + BART Fine-tuned)": summarize_hybrid,
    "Abstractive (LongT5)": summarize_with_t5,
    "Abstractive (LED)": summarize_with_led,
    "Abstractive (BART Fine-tuned)": summarize_with_bart_ft,
    "Abstractive (BART-large-CNN)": summarize_with_bart_cnn,
    "Extractive (TextRank)": summarize_with_textrank,
    "Extractive (Latent Semantic Analysis)": summarize_with_lsa,
    "Keyphrase Extraction (RAKE)": extract_sentences_with_obligations,
}

# Display label -> key of the score dict returned by `Rouge.get_scores`.
_ROUGE_METRICS = (
    ("ROUGE-1", "rouge-1"),
    ("ROUGE-2", "rouge-2"),
    ("ROUGE-L", "rouge-l"),
)


# Function to handle file upload and return its content
def load_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader`` (here: the Streamlit uploaded file).

    Returns:
        The text of all pages joined without a separator. Pages for which
        ``extract_text()`` yields nothing contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def _render_word_cloud(summary):
    """Draw a word cloud of *summary* in the current Streamlit container."""
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=20,
    ).generate(summary)

    # Streamlit is handed PNG bytes, so go WordCloud -> PIL image -> bytes.
    buf = io.BytesIO()
    wordcloud.to_image().save(buf, format='PNG')
    st.image(
        buf.getvalue(),
        caption='Word Cloud of Summary',
        use_column_width=True,
    )


def _render_rouge_scores(summary, reference_index):
    """Show ROUGE F-scores of *summary* against a dataset reference summary.

    Nothing is rendered when the dataset row has no (or an empty) reference
    summary, because there is nothing meaningful to score against.
    """
    row = dataset['train'][reference_index]
    if 'summary' not in row:
        return
    reference_summary = row['summary']
    if not reference_summary:
        return

    scores = Rouge().get_scores(summary, reference_summary)

    # Dedicated names: do not shadow the page-level layout columns.
    metric_columns = st.columns(len(_ROUGE_METRICS))
    for column, (label, key) in zip(metric_columns, _ROUGE_METRICS):
        with column:
            st.markdown(f"{label}: {scores[0][key]['f']:.4f}")


def _render_summary():
    """Render the stored summary, its word cloud and, if applicable, ROUGE."""
    st.write("Summary:")
    if 'summary' not in st.session_state:
        return

    summary = st.session_state.summary
    st.write(summary)

    # An empty summary cannot be turned into a word cloud or scored.
    if isinstance(summary, str) and not summary.strip():
        return

    _render_word_cloud(summary)

    # Only score summaries that were actually produced from a dataset
    # document; the index is recorded at summarization time so a later rerun
    # with different widget values cannot pair the summary with an unrelated
    # reference.
    reference_index = st.session_state.get("summary_reference_index")
    if reference_index is not None:
        _render_rouge_scores(summary, reference_index)


# Main app
def main():
    """Build the three-column page and run the selected summarizer.

    Left column: summarizer choice. Middle column: the inputs (text area,
    PDF upload, dataset document) and the "Summarize" button. Right column:
    the most recent summary, kept in ``st.session_state`` across reruns.
    """
    st.title("QuickToS - Terms of Service Summarizer")

    # Layout: 3 columns
    left_col, middle_col, right_col = st.columns([1, 3, 2], gap="large")

    # Left column: Radio buttons for summarizer choice
    with left_col:
        radio_selection = st.radio(
            "Choose type of summarizer:", list(_SUMMARIZERS)
        )

    # Middle column: Text input and File uploader
    with middle_col:
        user_input = st.text_area("Enter a text")
        uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

        # Dropdown for selecting the document
        tos_selection_index = st.selectbox(
            "Select a Terms of Service Document",
            range(len(tos_titles)),
            format_func=lambda x: tos_titles[x],
        )

        if st.button("Summarize"):
            reference_index = None

            # The dropdown always has a selection, so it acts as the fallback
            # source; only PDF + text together is ambiguous.
            if uploaded_file and user_input:
                st.warning("Please provide either text input or a PDF file, not both.")
                return
            elif uploaded_file:
                # Extract text from PDF
                file_content = load_pdf(uploaded_file)
                if not file_content.strip():
                    st.warning("No extractable text was found in the PDF.")
                    return
                st.write("PDF uploaded successfully.")
            elif user_input:
                file_content = user_input
            elif tos_selection_index is not None:
                file_content = dataset['train'][tos_selection_index]['plain_text']
                reference_index = tos_selection_index
            else:
                st.warning("Please upload a PDF, enter some text, or select a document to summarize.")
                return

            summarize = _SUMMARIZERS[radio_selection]
            st.session_state.summary = summarize(file_content)
            st.session_state.summary_reference_index = reference_index

    # Right column: Displaying text after pressing 'Summarize'
    with right_col:
        _render_summary()


if __name__ == "__main__":
    main()