import streamlit as st
import torch
from PyPDF2 import PdfReader
from transformers import pipeline

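# Note: PyPDF2's development has moved to its successor package, pypdf; if pypdf is
# installed instead, the equivalent import would be `from pypdf import PdfReader`
# (an alternative, not required for this script).
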
st.subheader("File Summarization Tool")

# Use the GPU (device 0) if one is available; otherwise run on the CPU.
device = 0 if torch.cuda.is_available() else -1

# Build the summarization pipeline; if loading fails, report the error and disable summarization.
try:
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
except Exception as e:
    st.error(f"Error loading model: {e}")
    summarizer = None

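
# Note: Streamlit re-runs this script on every widget interaction, so the pipeline
# above is rebuilt each time. One common way to avoid that (a sketch, assuming a
# Streamlit version that provides st.cache_resource) is to wrap the load in a
# cached function:
#
#     @st.cache_resource
#     def load_summarizer():
#         return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
#
#     summarizer = load_summarizer()
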
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in the uploaded PDF."""
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages without extractable text.
        text += page.extract_text() or ""
    return text


def extract_text_from_txt(txt_file):
    """Decode an uploaded plain-text file as UTF-8."""
    return txt_file.read().decode("utf-8")

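# st.file_uploader returns an UploadedFile, a binary file-like object, which is why
# the helpers above can pass it straight to PdfReader or call .read() on it.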
file = st.file_uploader("Upload a PDF or TXT file", type=["pdf", "txt"])
user_text = st.text_area("Or write your text here:", "")

if (file or user_text) and summarizer:
    try:
        # Prefer the uploaded file; fall back to the text box if no file was given.
        if file:
            if file.type == "application/pdf":
                text = extract_text_from_pdf(file)
            elif file.type == "text/plain":
                text = extract_text_from_txt(file)
            else:
                st.error("Unsupported file type.")
                text = ""
        else:
            text = user_text

        if len(text) > 0:
            # Break long inputs into ~512-word chunks and summarize each chunk separately.
            def split_text_into_chunks(text, chunk_size=512):
                words = text.split()
                for i in range(0, len(words), chunk_size):
                    yield " ".join(words[i:i + chunk_size])

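            # Note: chunk_size is counted in words, which is only a rough proxy for the
            # model's input limit (roughly 1,024 tokens for this checkpoint); unusually
            # dense chunks may still exceed what the model can read in full.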
            chunks = list(split_text_into_chunks(text))
            summaries = []

            for chunk in chunks:
                # max_length/min_length are token counts for the generated summary;
                # do_sample=False keeps the output deterministic.
                summarized_chunk = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
                summaries.append(summarized_chunk[0]["summary_text"])

            # Stitch the per-chunk summaries back into a single summary.
            summary = " ".join(summaries)

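            # Optional refinement (not in the original flow): running `summarizer` once
            # more over `summary` can tighten the result for very long inputs.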
            st.subheader("Summary")
            st.write(summary)
        else:
            st.warning("No text could be extracted from the file or provided by the user.")

    except Exception as e:
        st.error(f"An error occurred during summarization: {e}")
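
# To try the app locally (assuming this file is saved as app.py and that streamlit,
# torch, transformers, and PyPDF2 are installed):
#
#     streamlit run app.py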