import streamlit as st
from transformers import pipeline
import torch
from PyPDF2 import PdfReader

st.subheader("File Summarization Tool")

# Use the GPU (device 0) if one is available, otherwise fall back to the CPU (-1)
device = 0 if torch.cuda.is_available() else -1

# Load the summarization pipeline (Hugging Face model); keep summarizer as None
# on failure so the app can report the error instead of crashing
try:
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
    # summarizer = pipeline("summarization", model="facebook/bart-large", device=device)
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    summarizer = None
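
# Note: Streamlit re-executes this script on every interaction, so the pipeline
# above is reloaded on each rerun. A cached loader is a possible improvement
# (a sketch only, reusing the same model name as above):
#
#   @st.cache_resource
#   def load_summarizer():
#       return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
#
#   summarizer = load_summarizer()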

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() may return None for pages without extractable text
        text += page.extract_text() or ""
    return text

# Function to extract text from a TXT file (assumes UTF-8 encoding)
def extract_text_from_txt(txt_file):
    return txt_file.read().decode("utf-8")

# Streamlit file uploader
file = st.file_uploader("Upload a PDF or TXT file", type=["pdf", "txt"])
# Text input area for user-provided text
user_text = st.text_area("Or write your text here:", "")

if (file or user_text) and summarizer:
    try:
        # Extract text based on input type
        if file:
            if file.type == "application/pdf":
                text = extract_text_from_pdf(file)
            elif file.type == "text/plain":
                text = extract_text_from_txt(file)
            else:
                st.error("Unsupported file type.")
                text = ""
        else:
            text = user_text

        if text.strip():
            # Split the text into fixed-size word chunks so each piece stays
            # within the model's input limit (~1024 tokens for DistilBART)
            def split_text_into_chunks(text, chunk_size=512):
                words = text.split()
                for i in range(0, len(words), chunk_size):
                    yield " ".join(words[i:i + chunk_size])

            chunks = list(split_text_into_chunks(text))
            summaries = []

            # Summarize each chunk independently
            for chunk in chunks:
                summarized_chunk = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
                summaries.append(summarized_chunk[0]['summary_text'])

            # Combine the per-chunk summaries into a single summary
            summary = " ".join(summaries)

            # Display the summary
            st.subheader("Summary")
            st.write(summary)
        else:
            st.warning("No text could be extracted from the file or provided by the user.")
    except Exception as e:
        st.error(f"An error occurred during summarization: {str(e)}")
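
# To run this app locally (assuming the script is saved as app.py; the filename
# is illustrative):
#   pip install streamlit torch transformers PyPDF2
#   streamlit run app.py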