testingspace / app.py
mrsk1883's picture
Update app.py
a06b80e
raw
history blame
1.69 kB
import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re
# Checkpoint identifier handed to ``from_pretrained`` for both the tokenizer
# and the seq2seq model. Both are loaded once, at import time, and shared by
# every request handled by this module.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# A sentence boundary is a whitespace character that directly follows ".",
# "?" or "!". The two negative lookbehinds suppress a split after short
# dotted abbreviations ("e.g.", "i.e.") and after capitalised two-letter
# titles ("Dr.", "Mr.").
_SENTENCE_BOUNDARY = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s'
)


def extract_first_sentence(text: str) -> str:
    """Return the first sentence of *text*.

    The text is cut at the first sentence boundary (see
    ``_SENTENCE_BOUNDARY``). The terminating punctuation is kept and the
    whitespace after it is dropped. If no boundary is found, including for
    an empty string, *text* is returned unchanged.

    Args:
        text: The text to take the first sentence from.

    Returns:
        Everything up to and including the first sentence terminator, or
        the whole of *text* when it contains no sentence boundary.
    """
    # ``split`` always yields at least one element, so no emptiness check is
    # needed. ``maxsplit=1`` stops scanning once the first boundary is found.
    return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]
def summarize_pdf_abstract(pdf_file):
    """Summarize the abstract page of a PDF and synthesize it as speech.

    The first page whose extracted text contains "Abstract" or
    "Introduction" is summarized with the module-level ``model`` and
    ``tokenizer``. Only the first sentence of the generated summary is kept,
    and that sentence is converted to speech with gTTS.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source.

    Returns:
        A dict with two keys: ``"summary"`` (the first sentence of the
        generated summary) and ``"audio"`` (a ``BytesIO`` holding the gTTS
        output, rewound to the start so it can be read directly).

    Raises:
        Exception: If reading, summarizing or speech synthesis fails, or if
            no page mentions "Abstract" or "Introduction". The original
            error is attached as ``__cause__``.
    """
    try:
        reader = PdfReader(pdf_file)

        abstract_text = ""
        for page in reader.pages:
            # Extract once per page instead of up to three times.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

        if not abstract_text:
            # Fail with a clear message instead of summarizing an empty string.
            raise ValueError(
                "No page containing 'Abstract' or 'Introduction' was found "
                "in the PDF."
            )

        # Truncate so an over-long page cannot exceed the model's input limit.
        inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs)

        # Drop special tokens so they appear neither in the text nor in the
        # synthesized speech.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)
        # Writing leaves the cursor at the end of the buffer; rewind it so a
        # consumer calling read() gets the audio rather than nothing.
        speech_bytes.seek(0)

        return {"summary": summary_sentence, "audio": speech_bytes}
    except Exception as e:
        # Same exception type and message as before, but keep the original
        # error chained for debugging.
        raise Exception(str(e)) from e
def play_audio(audio_bytes):
    """Build a Gradio ``Audio`` component initialised with *audio_bytes*.

    Args:
        audio_bytes: Value passed to ``gr.Audio`` as its first positional
            argument.

    Returns:
        The newly created ``gr.Audio`` component.
    """
    audio_component = gr.Audio(audio_bytes)
    return audio_component
def _summarize_for_ui(pdf_file):
    """Adapt ``summarize_pdf_abstract`` to the two Gradio output components.

    ``summarize_pdf_abstract`` returns a dict, whereas ``gr.Interface``
    expects one return value per output component. This wrapper unpacks the
    dict and writes the synthesized speech to a temporary MP3 file so the
    ``Audio`` output can be given a file path.

    Args:
        pdf_file: The uploaded file as passed in by the ``gr.File`` input.

    Returns:
        A ``(summary_text, audio_file_path)`` tuple, in the order of the
        interface's outputs.
    """
    import tempfile

    result = summarize_pdf_abstract(pdf_file)

    # getvalue() returns the whole buffer regardless of the stream position.
    audio_data = result["audio"].getvalue()
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        audio_file.write(audio_data)

    return result["summary"], audio_file.name


# Outputs must be components, not callables: the summary goes to a Textbox
# and the generated speech to an Audio player.
interface = gr.Interface(
    fn=_summarize_for_ui,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[gr.Textbox(label="Summary"), gr.Audio(label="Spoken Summary")],
)
interface.launch(share=True)