File size: 6,336 Bytes
c0ae847
2d2d28b
03ddbfd
50d3158
7a228d7
432c28d
4b1dcc8
cce90fc
46193fd
8e106db
2d2d28b
 
 
 
c7464b2
2d2d28b
20d6d68
 
 
 
 
 
 
 
03ddbfd
20d6d68
 
 
 
 
 
 
2115e8c
20d6d68
03ddbfd
20d6d68
b3bc472
ffc170d
1918f01
3dd36dd
 
 
 
 
 
 
bfd0b51
2115e8c
20d6d68
 
b87bcef
20d6d68
2d2d28b
 
8e106db
2d2d28b
20d6d68
71f1303
2a61e91
 
 
03ddbfd
20d6d68
 
b87bcef
 
c7464b2
2d2d28b
20d6d68
2d2d28b
b87bcef
 
 
94d59bf
f497f7f
2a61e91
20d6d68
94d59bf
 
 
 
 
97f7d3e
da1f55e
1918f01
da1f55e
 
 
 
7a228d7
ccd2173
97f7d3e
b3bc472
 
 
 
 
 
eea15da
b3bc472
 
 
d7485e8
c0ade28
432c28d
 
 
d7485e8
c0ade28
13e8889
d7485e8
2115e8c
20d6d68
 
0f4d5d5
cce90fc
 
97f7d3e
2115e8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20d6d68
b87bcef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import streamlit as st
from datasets import load_dataset
import PyPDF2
from extractive_summarization import summarize_with_textrank, summarize_with_lsa
from abstractive_summarization import summarize_with_bart_cnn, summarize_with_bart_ft, summarize_with_led, summarize_with_t5
from keyword_extraction import extract_keywords
from keyphrase_extraction import extract_sentences_with_obligations
#from blanc import BlancHelp


# Fetch the Terms-of-Service corpus via `datasets.load_dataset`.
dataset = load_dataset("EE21/ToS-Summaries")

# Display labels for the document picker: one "Document <index>" entry per
# row of the 'train' split, in row order.
tos_titles = ["Document " + str(row_idx) for row_idx in range(len(dataset['train']))]

# Let the page use the full browser width (the UI is laid out in columns).
st.set_page_config(layout="wide")

# Function to handle file upload and return its content
def load_pdf(file):
    """Return the text of every page of a PDF, concatenated into one string.

    Args:
        file: Anything ``PyPDF2.PdfReader`` accepts. In this app it is the
            object returned by ``st.file_uploader``.

    Returns:
        The per-page texts joined in page order with no separator. A page
        whose ``extract_text()`` result is falsy contributes an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Iterate the pages directly and join once instead of indexing by page
    # number and growing the string with repeated `+=`.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def _collect_input(uploaded_file, user_input, tos_selection_index):
    """Decide which text the user wants summarized.

    Priority is: uploaded PDF, then typed text, then the selected dataset
    document. Supplying both a PDF and typed text is rejected.

    Args:
        uploaded_file: Value returned by ``st.file_uploader`` (falsy if none).
        user_input: Contents of the text area (empty string if none).
        tos_selection_index: Index chosen in the document select box, or
            ``None`` when nothing is selectable.

    Returns:
        ``(text, reference_index)`` where ``reference_index`` is the dataset
        row the text came from, or ``None`` when the text did not come from
        the dataset. Returns ``None`` (after showing a warning) when no usable
        input was provided.
    """
    # The select box always yields an index (and index 0 is falsy), so it must
    # not take part in the "both inputs given" check.
    if uploaded_file and user_input:
        st.warning("Please provide either text input or a PDF file, not both.")
        return None
    if uploaded_file:
        pdf_text = load_pdf(uploaded_file)
        st.write("PDF uploaded successfully.")
        return pdf_text, None
    if user_input:
        return user_input, None
    if tos_selection_index is not None:
        return dataset['train'][tos_selection_index]['plain_text'], tos_selection_index
    st.warning("Please upload a PDF, enter some text, or select a document to summarize.")
    return None


def _show_rouge_scores(summary, reference_index):
    """Display ROUGE-1/2/L F-scores of ``summary`` against its reference.

    Nothing is shown when the summary did not originate from a dataset
    document, when that document has no reference summary, or when the
    summary is not a non-empty string (e.g. an extraction step returned a
    different type).

    Args:
        summary: The generated output stored in the session state.
        reference_index: Dataset row the summary was generated from, or
            ``None`` if it came from a PDF or typed text.
    """
    if reference_index is None:
        return
    if not isinstance(summary, str) or not summary.strip():
        return

    # NOTE(review): assumes the 'summary' column holds a plain string -- confirm
    # against the dataset schema.
    reference_summary = dataset['train'][reference_index].get('summary')
    if not reference_summary:
        return

    # `Rouge` was used but never imported in this file. Import it here and
    # degrade gracefully so a missing package does not take down the page.
    try:
        from rouge import Rouge
    except ImportError:
        st.info("ROUGE scores are unavailable because the 'rouge' package is not installed.")
        return

    scores = Rouge().get_scores(summary, reference_summary)

    st.write("ROUGE Scores:")
    st.write(f"ROUGE-1: {scores[0]['rouge-1']['f']:.4f}")
    st.write(f"ROUGE-2: {scores[0]['rouge-2']['f']:.4f}")
    st.write(f"ROUGE-L: {scores[0]['rouge-l']['f']:.4f}")


# Main app
def main():
    """Render the Terms of Service summarizer page.

    The page has three columns: the method picker, the input widgets with the
    "Summarize" button, and the result. The result and the dataset row it was
    generated from are kept in ``st.session_state`` so they survive reruns.
    If "Summarize" is pressed without usable input, a warning is shown and the
    function returns before the result column is rendered.
    """
    st.title("Terms of Service Summarizer")

    # Radio label -> function producing the output for that label. Insertion
    # order is the order in which the options appear in the UI.
    summarizers = {
        "Abstractive (T5)": summarize_with_t5,
        "Abstractive (LED)": summarize_with_led,
        "Abstractive (Fine-tuned BART)": summarize_with_bart_ft,
        "Abstractive (BART-large-CNN)": summarize_with_bart_cnn,
        "Extractive (TextRank)": summarize_with_textrank,
        "Extractive (Latent Semantic Analysis)": summarize_with_lsa,
        "Keyphrase Extraction (RAKE)": extract_sentences_with_obligations,
        "Keyword Extraction (RAKE)": extract_keywords,
    }

    # Layout: 3 columns
    col1, col2, col3 = st.columns([1, 2, 3], gap="large")

    # Left column: Radio buttons for summarizer choice
    with col1:
        help_text = (
            "Abstractive: Abstractive summarization generates a summary that may contain words not present in the original text. "
            "It uses a fine-tuned model on BART-large-CNN.<br>"
            "Extractive: Extractive summarization selects and extracts sentences or phrases directly from the original text to create a summary using the TextRank algorithm.<br>"
            "Keyword Extraction: Keyword extraction identifies and extracts important keywords or terms from the text using the Rake algorithm. "
            "These keywords can be used for various purposes such as content analysis and SEO.<br>"
            "Keyphrase Extraction: Keyphrase extraction is similar to keyword extraction but focuses on identifying multi-word phrases or expressions that are significant in the text using the Rake algorithm."
        )
        radio_selection = st.radio("Choose type of summarizer:", list(summarizers), help=help_text)

    # Middle column: Text input and File uploader
    with col2:
        user_input = st.text_area("Enter your text here:")
        uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

        # Dropdown for selecting the document
        tos_selection_index = st.selectbox(
            "Select a Terms of Service Document",
            range(len(tos_titles)),
            format_func=lambda x: tos_titles[x],
        )

        if st.button("Summarize"):
            chosen = _collect_input(uploaded_file, user_input, tos_selection_index)
            if chosen is None:
                return
            file_content, reference_index = chosen

            summarize = summarizers.get(radio_selection)
            if summarize is not None:
                st.session_state.summary = summarize(file_content)
                # Remember which dataset row (if any) the summary belongs to,
                # so ROUGE is never computed against an unrelated reference.
                st.session_state.summary_reference_index = reference_index

    # Right column: Displaying text after pressing 'Summarize'
    with col3:
        st.write("Summary:")
        if 'summary' in st.session_state:
            st.write(st.session_state.summary)
            _show_rouge_scores(
                st.session_state.summary,
                st.session_state.get('summary_reference_index'),
            )

# Build the page only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()