"""Gradio app: extract a PDF's abstract, summarize it, and read it aloud."""

import os
import re
from io import BytesIO

import gradio as gr
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Load the LED-large model for summarization (done once, at import time).
model_name = "pszemraj/led-large-book-summary"
summarizer = pipeline("summarization", model=model_name, tokenizer=model_name)

# Section headings that delimit the abstract; compiled once because they are
# applied to every page of every uploaded PDF.
_ABSTRACT_RE = re.compile(r"\bAbstract\b", re.IGNORECASE)
_INTRODUCTION_RE = re.compile(r"\bIntroduction\b", re.IGNORECASE)

# Directory of this script; the bundled example PDFs are resolved against it.
_APP_DIR = os.path.dirname(__file__)


def _find_abstract(page_text):
    """Return the text between "Abstract" and "Introduction" on one page.

    Returns ``None`` when the page contains no "Abstract" heading. When no
    "Introduction" heading follows on the same page, everything after
    "Abstract" up to the end of the page is returned.
    """
    abstract_match = _ABSTRACT_RE.search(page_text)
    if abstract_match is None:
        return None

    remainder = page_text[abstract_match.end():]
    introduction_match = _INTRODUCTION_RE.search(remainder)
    if introduction_match is None:
        return remainder
    return remainder[:introduction_match.start()]


def extract_abstract_and_summarize(pdf_file):
    """Summarize the abstract of a PDF and synthesize speech for the summary.

    The first page containing the word "Abstract" is used; the abstract is
    the text from there up to "Introduction" (or the end of that page).

    Args:
        pdf_file: Path to the PDF, or an uploaded-file object exposing the
            path through a ``name`` attribute.

    Returns:
        A 3-tuple ``(first_sentence, audio_bytes, abstract_text)``: the first
        sentence of the model summary, the speech audio bytes written by
        gTTS, and the stripped abstract text.

    Raises:
        Exception: on any failure (missing file, no abstract found, model or
            text-to-speech errors). The message is that of the underlying
            error, which is attached as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("PDF file is not provided.")

        # gr.File may hand over a plain path or a temp-file wrapper depending
        # on the Gradio version; only unwrap `.name` for non-path objects
        # (pathlib paths also have a `.name`, which is just the basename).
        if isinstance(pdf_file, (str, bytes, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        abstract_text = ""
        with open(pdf_path, "rb") as file:
            pdf_reader = PdfReader(file)
            for page in pdf_reader.pages:
                # extract_text() can yield nothing for image-only pages.
                found = _find_abstract(page.extract_text() or "")
                if found is not None:
                    abstract_text = found
                    break

        # Fail with a clear message instead of summarizing an empty string.
        if not abstract_text.strip():
            raise ValueError("No abstract found in the PDF.")

        # Summarize the extracted abstract using the LED-large model with a
        # specific max_length
        result = summarizer(abstract_text, max_length=81)

        # Extract only the first sentence from the summary
        if isinstance(result, list) and result:
            summary = result[0].get('summary_text', 'Summary not available.')
            first_sentence = summary.split('.')[0] + '.'
        else:
            first_sentence = "Summary not available."

        # Generate audio
        speech = gTTS(text=first_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return first_sentence, speech_bytes.getvalue(), abstract_text.strip()
    except Exception as e:
        # Callers see a plain Exception, as before; chaining keeps the
        # original traceback available for debugging.
        raise Exception(str(e)) from e


# NOTE(review): the handler returns three values but only two output
# components are declared, so the abstract text is not shown in the UI.
# Presumably Gradio drops (or warns about) the extra value -- confirm, and add
# a third Textbox here if the abstract should be displayed.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[gr.Textbox(label="Summary"), gr.Audio()],
    title="PDF Summarization & Audio Tool",
    description=(
        "PDF Summarization App. \n"
        "This app extracts the abstract from a PDF, summarizes it using the "
        "'pszemraj/led-large-book-summary' model, and generates an audio of it. "
        "Only upload PDFs with abstracts. Please read the README.MD for "
        "information about the app and sample PDFs."
    ),
    examples=[
        [
            os.path.join(
                _APP_DIR,
                "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf",
            )
        ],
        [
            os.path.join(
                _APP_DIR,
                "Article 4 Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence.pdf",
            )
        ],
    ],
    cache_examples=True,
)

interface.launch()