mrsk1883 committed on
Commit
099e779
·
1 Parent(s): 45ca8c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -40
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
  from PyPDF2 import PdfReader
3
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
  from gtts import gTTS
5
- from io import BytesIO
6
  import re
7
 
8
  model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
@@ -10,49 +9,42 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
10
  tokenizer = AutoTokenizer.from_pretrained(model_name)
11
 
12
  def extract_first_sentence(text):
13
- """
14
- Extracts the first sentence from the given text.
15
- """
16
- sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
17
- if sentences:
18
- return sentences[0]
19
- else:
20
- return text
21
 
22
  def summarize_pdf_abstract(pdf_file):
23
- """
24
- Reads a PDF file, extracts the abstract, summarizes it as the first sentence, and generates audio.
25
- """
26
- try:
27
- reader = PdfReader(pdf_file)
28
- abstract_text = ""
29
- for page in reader.pages:
30
- if "Abstract" in page.extract_text() or "Introduction" in page.extract_text():
31
- abstract_text = page.extract_text()
32
- break
33
-
34
- inputs = tokenizer(abstract_text, return_tensors="pt")
35
- outputs = model.generate(**inputs)
36
- summary = tokenizer.decode(outputs[0])
37
-
38
- # Extract only the first sentence
39
- summary_sentence = extract_first_sentence(summary)
40
-
41
- # Generate audio
42
- speech = gTTS(text=summary_sentence, lang="en")
43
- speech_bytes = BytesIO()
44
- speech.write_to_fp(speech_bytes)
45
-
46
- # Return individual output values
47
- return summary_sentence, speech_bytes
48
-
49
- except Exception as e:
50
- raise Exception(str(e))
51
 
52
  interface = gr.Interface(
53
- fn=summarize_pdf_abstract,
54
- inputs=[gr.File(label="Upload PDF")],
55
- outputs=[gr.Textbox(label="Summary"), gr.Audio()],
56
  )
57
 
58
  interface.launch(share=True)
 
2
  from PyPDF2 import PdfReader
3
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
  from gtts import gTTS
 
5
  import re
6
 
7
  model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
 
9
  tokenizer = AutoTokenizer.from_pretrained(model_name)
10
 
def extract_first_sentence(text):
    """Return the first sentence of *text*.

    A sentence boundary is a whitespace character that follows ``.`` or
    ``?``, unless a lookbehind rules it out: the boundary is skipped after a
    dotted abbreviation such as ``e.g.`` and after a capitalised two-letter
    title such as ``Dr.``.

    Args:
        text: The string to take the leading sentence from.

    Returns:
        Everything before the first sentence boundary, or *text* unchanged
        (including the empty string) when no boundary is found.
    """
    # re.split always yields at least one element, so [0] is safe and no
    # fallback branch is needed; maxsplit=1 stops at the first boundary.
    return re.split(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text, maxsplit=1
    )[0]
 
 
 
17
 
def summarize_pdf_abstract(pdf_file):
    """Summarize a PDF's abstract page as one sentence and synthesize speech.

    The first page whose extracted text contains "Abstract" or
    "Introduction" is passed through the module-level ``tokenizer`` and
    ``model``. The first sentence of the decoded output is returned together
    with the audio that gTTS generates for that sentence.

    Args:
        pdf_file: The uploaded PDF, in whatever form ``PdfReader`` accepts
            (presumably the value Gradio supplies for ``gr.File`` --
            TODO confirm against the installed Gradio version).

    Returns:
        A ``(summary_sentence, audio_bytes)`` tuple: the sentence as ``str``
        and the gTTS-encoded audio as ``bytes``.

    Raises:
        Exception: On any failure. The message is the text of the underlying
            error, which is also attached as ``__cause__``.
    """
    # Local import: BytesIO is not among this file's top-level imports.
    from io import BytesIO

    try:
        reader = PdfReader(pdf_file)
        abstract_text = ""
        for page in reader.pages:
            # Extract once per page instead of once per check.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

        if not abstract_text:
            # Fail clearly rather than summarizing an empty string.
            raise ValueError(
                "No page containing 'Abstract' or 'Introduction' was found."
            )

        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop special tokens so they are neither displayed nor spoken.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio. write_to_fp() fills the buffer in place and returns
        # None, so the buffer must be kept in its own variable.
        speech = gTTS(text=summary_sentence, lang="en")
        audio_buffer = BytesIO()
        speech.write_to_fp(audio_buffer)

        # Return individual output values
        return summary_sentence, audio_buffer.getvalue()

    except Exception as e:
        # Same exception type and message as before, but keep the cause so
        # the original traceback is not lost.
        raise Exception(str(e)) from e
 
 
 
 
43
 
# Gradio UI wiring: one PDF upload in, summary text and audio out.
_pdf_upload = gr.File(label="Upload PDF")
_summary_box = gr.Textbox(label="Summary")
_audio_player = gr.Audio()

interface = gr.Interface(
    fn=summarize_pdf_abstract,
    inputs=[_pdf_upload],
    outputs=[_summary_box, _audio_player],
)

# Start the app; share=True is passed through to Gradio's launch().
interface.launch(share=True)