File size: 1,789 Bytes
cba2ea4
5f7e3d8
c363ecf
 
cba2ea4
5173dd6
 
cba2ea4
 
5f7e3d8
c363ecf
 
 
5f7e3d8
cba2ea4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c98d7c6
 
cba2ea4
 
 
c98d7c6
 
 
cba2ea4
 
 
 
 
 
 
c98d7c6
5173dd6
 
 
c98d7c6
5173dd6
 
c98d7c6
cba2ea4
 
c98d7c6
cba2ea4
 
 
 
 
b2410fa
5173dd6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import pydub
from pydub import AudioSegment
import base64
import re

# Checkpoint name handed to from_pretrained for both the model and the
# tokenizer (looks like a Hugging Face Hub repo id).
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
# Loaded once at import time; summarize_pdf_abstract reuses these module-level
# instances on every call instead of reloading them per request.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Whitespace that follows sentence-ending punctuation (., ? or !), unless the
# punctuation belongs to a dotted abbreviation such as "e.g." (first
# look-behind) or a short title such as "Dr." (second look-behind).
_SENTENCE_BOUNDARY = re.compile(
  r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s'
)


def extract_first_sentence(text):
  """
  Return the first sentence of ``text``.

  Surrounding whitespace is removed before splitting, so leading blanks or
  newlines never end up in the result. A sentence ends at ".", "?" or "!"
  followed by whitespace.

  Args:
    text: The string to take the first sentence from.

  Returns:
    The first sentence including its terminating punctuation, the whole
    stripped text when no sentence boundary is found, or "" for empty or
    whitespace-only input.
  """
  stripped = text.strip()
  # re.split always returns at least one element, so indexing [0] is safe;
  # maxsplit=1 stops scanning as soon as the first boundary is found.
  return _SENTENCE_BOUNDARY.split(stripped, maxsplit=1)[0]

def summarize_pdf_abstract(pdf_file):
  """
  Summarize a PDF's abstract page as one sentence and synthesize speech for it.

  The first page whose extracted text contains "Abstract" or "Introduction"
  is passed (as whole-page text) to the module-level summarization model.
  The first sentence of the generated summary is then converted to speech
  with gTTS and written to "summary.mp3".

  Args:
    pdf_file: The uploaded PDF. May be a filesystem path, a binary stream
      accepted by ``PdfReader``, or an object exposing its path as ``.name``
      (presumably what ``gr.File`` passes in some Gradio versions -- TODO
      confirm against the installed version).

  Returns:
    A ``(summary_sentence, audio_path)`` tuple; ``audio_path`` is always
    ``"summary.mp3"`` relative to the current working directory.

  Raises:
    gr.Error: If no file was supplied, no page mentions "Abstract" or
      "Introduction", the model produced no text, or any underlying step
      (PDF parsing, generation, text-to-speech) failed. The original
      exception is chained as ``__cause__``.
  """
  if pdf_file is None:
    raise gr.Error("Please upload a PDF file.")

  try:
    # Prefer the on-disk path when the upload object exposes one; plain path
    # strings and nameless streams are handed to PdfReader unchanged.
    pdf_source = getattr(pdf_file, "name", pdf_file)
    reader = PdfReader(pdf_source)

    abstract_text = ""
    for page in reader.pages:
      # Extract once per page instead of once per check; guard against a
      # page that yields no text at all.
      page_text = page.extract_text() or ""
      if "Abstract" in page_text or "Introduction" in page_text:
        abstract_text = page_text
        break

    if not abstract_text.strip():
      # Summarizing an empty string would only produce noise.
      raise gr.Error(
        "Could not find an Abstract or Introduction section in the PDF."
      )

    # truncation=True keeps an over-long page within the tokenizer's maximum
    # input length instead of overflowing the model.
    inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs)
    # Skip special tokens so markers like <s>/</s> are neither displayed nor
    # read aloud.
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the first sentence
    summary_sentence = extract_first_sentence(summary).strip()
    if not summary_sentence:
      raise gr.Error("The model did not produce a summary.")

    # gTTS writes MP3 data itself; a gTTS object is not a file or stream, so
    # it cannot be opened with AudioSegment.from_file.
    audio_path = "summary.mp3"
    gTTS(text=summary_sentence, lang="en").save(audio_path)

    # Return summary and audio filename
    return summary_sentence, audio_path

  except gr.Error:
    # Already a user-facing error raised above; do not re-wrap it.
    raise
  except Exception as e:
    # gr.Error subclasses Exception, so existing `except Exception` callers
    # still work; chaining keeps the original traceback for debugging.
    raise gr.Error(str(e)) from e

# Web UI wiring: one uploaded file goes in; summarize_pdf_abstract's two
# return values (summary text, audio file path) map onto the Textbox and
# Audio outputs in that order.
interface = gr.Interface(
  fn=summarize_pdf_abstract,
  inputs=[gr.File(label="Upload PDF")],
  outputs=[gr.Textbox(label="Summary"), gr.Audio()],
)

# Runs at module level, so the app starts as soon as this file is executed or
# imported. share=True asks Gradio for a public share link in addition to the
# local server (per Gradio's launch documentation).
interface.launch(share=True)