"""Summarize the text content of a PDF file with the TextRank algorithm."""

import PyPDF2
import nltk
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer

# Tokenizer("english") relies on NLTK's "punkt" sentence model; fetch it once
# at import time, quietly, so importing this module doesn't spam stdout.
nltk.download("punkt", quiet=True)


def summarize_pdf_with_textrank(pdf_path, sentences_count=10):
    """Summarize the content of a PDF file using the TextRank algorithm.

    Args:
        pdf_path (str): Path to the PDF file.
        sentences_count (int): Number of sentences for the summary.

    Returns:
        str: The summarized text, or an explanatory message when no text
        could be extracted from the PDF.
    """
    # Extract text page by page. extract_text() may return None for
    # image-only pages, hence the `or ""` guard. Join once instead of
    # repeated `+=` to avoid quadratic string concatenation.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pdf_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Bail out early when nothing usable came out of the PDF.
    if not pdf_text.strip():
        return "Text extraction from PDF failed or PDF is empty."

    # Parse the raw text into sumy's document model.
    parser = PlaintextParser.from_string(pdf_text, Tokenizer("english"))

    # Rank sentences with TextRank and keep the top `sentences_count`.
    text_rank_summarizer = TextRankSummarizer()
    text_rank_summary = text_rank_summarizer(
        parser.document, sentences_count=sentences_count
    )

    # Compile the selected sentences into a single newline-separated string.
    return "\n".join(str(sentence) for sentence in text_rank_summary)