File size: 1,789 Bytes
cba2ea4 5f7e3d8 c363ecf cba2ea4 5173dd6 cba2ea4 5f7e3d8 c363ecf 5f7e3d8 cba2ea4 c98d7c6 cba2ea4 c98d7c6 cba2ea4 c98d7c6 5173dd6 c98d7c6 5173dd6 c98d7c6 cba2ea4 c98d7c6 cba2ea4 b2410fa 5173dd6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import pydub
from pydub import AudioSegment
import base64
import re
# Checkpoint identifier handed to both ``from_pretrained`` calls below.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"

# Both objects are created once at import time and reused by every request.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Whitespace that follows a sentence terminator (., ? or !), except when the
# terminator belongs to a dotted abbreviation such as "e.g." or to a
# capitalised two-letter title such as "Dr.".
_SENTENCE_BOUNDARY = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s'
)


def extract_first_sentence(text):
    """
    Return the first sentence of ``text``.

    Surrounding whitespace is stripped first so that a leading space or
    newline does not become part of the sentence. If no sentence boundary is
    found, the whole (stripped) text is returned; an empty or
    whitespace-only input yields ``""``.

    Args:
        text: The string to take the first sentence from.

    Returns:
        The first sentence, including its terminating punctuation.
    """
    # maxsplit=1: only the first piece is needed, so stop at the first
    # boundary. split() always returns at least one element, so indexing
    # [0] is safe even for an empty string.
    return _SENTENCE_BOUNDARY.split(text.strip(), maxsplit=1)[0]
def _resolve_pdf_source(pdf_file):
    """Return a path for ``pdf_file`` when one is known, else the object itself."""
    if isinstance(pdf_file, str) or hasattr(pdf_file, "__fspath__"):
        return pdf_file
    # NOTE(review): upload wrappers are assumed to expose the on-disk path as
    # ``.name`` (the wrapper's own stream may not hold the data) - confirm
    # against the installed gradio version.
    name = getattr(pdf_file, "name", None)
    return name if isinstance(name, str) else pdf_file


def summarize_pdf_abstract(pdf_file):
    """
    Summarize a PDF's abstract page into one sentence and speak it.

    The first page whose extracted text contains "Abstract" or
    "Introduction" is passed through the module-level ``tokenizer`` and
    ``model``. The first sentence of the decoded output is converted to
    speech with gTTS and saved as ``summary.mp3``.

    Args:
        pdf_file: A path to a PDF, a binary file-like object, or an upload
            wrapper carrying the file's path in ``.name``.

    Returns:
        A ``(summary_sentence, audio_path)`` tuple, where ``audio_path`` is
        ``"summary.mp3"``.

    Raises:
        Exception: On any failure (unreadable PDF, no matching page, empty
            summary, model or text-to-speech error). The message is that of
            the underlying error, which is chained as ``__cause__``.
    """
    try:
        reader = PdfReader(_resolve_pdf_source(pdf_file))

        abstract_text = ""
        for page in reader.pages:
            # Extract once per page and reuse the result for both the
            # keyword check and the model input.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

        if not abstract_text.strip():
            # Fail loudly instead of asking the model to summarize nothing.
            raise ValueError(
                'No page containing "Abstract" or "Introduction" was found '
                "in the PDF."
            )

        # truncation=True keeps over-long pages within the tokenizer's
        # configured maximum length.
        inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs)
        # Drop markers such as <s> / </s> so they are neither shown nor
        # read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Keep only the first sentence of the generated summary.
        summary_sentence = extract_first_sentence(summary)
        if not summary_sentence.strip():
            raise ValueError("The model produced an empty summary.")

        # gTTS writes the MP3 itself; the gTTS object is not a file or
        # stream, so it cannot be loaded through AudioSegment.from_file.
        audio_path = "summary.mp3"
        gTTS(text=summary_sentence, lang="en").save(audio_path)

        return summary_sentence, audio_path
    except Exception as e:
        # Same outward exception type and message as before, with the
        # original error explicitly chained for debugging.
        raise Exception(str(e)) from e
# UI components: one PDF upload as input, a text box and an audio player as
# outputs (matching the two values returned by summarize_pdf_abstract).
pdf_upload = gr.File(label="Upload PDF")
summary_box = gr.Textbox(label="Summary")
audio_player = gr.Audio()

interface = gr.Interface(
    fn=summarize_pdf_abstract,
    inputs=[pdf_upload],
    outputs=[summary_box, audio_player],
)

# Start the app as soon as the module is executed.
interface.launch(share=True)
|