mrsk1883 committed on
Commit
cba2ea4
·
1 Parent(s): c98d7c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -64
app.py CHANGED
@@ -1,82 +1,62 @@
 
1
  from PyPDF2 import PdfReader
2
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
  from gtts import gTTS
4
- import os
 
 
5
 
6
- # Download the model and tokenizer
7
  model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
8
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
9
  tokenizer = AutoTokenizer.from_pretrained(model_name)
10
 
11
-
12
- def summarize_and_speak_pdf_abstract(pdf_path):
13
- """
14
- Reads a PDF file, extracts the abstract, summarizes it in one sentence, and generates an audio file of the summary.
15
-
16
- Args:
17
- pdf_path: Path to the PDF file.
18
- """
19
-
20
- # Summarize the abstract
21
- summary = summarize_pdf_abstract(pdf_path)
22
-
23
- # Define language and audio format
24
- language = "en" # Change this to your desired language
25
- audio_format = "mp3"
26
-
27
- # Create the text-to-speech object
28
- tts = gTTS(text=summary, lang=language)
29
-
30
- # Generate the audio file
31
- audio_file_name = f"summary.{audio_format}"
32
- tts.save(audio_file_name)
33
-
34
- print(f"Audio file created: {audio_file_name}")
35
-
36
- # Play the audio file (optional)
37
- # os.system(f"play {audio_file_name}")
38
-
39
-
40
- # Define the function to summarize the abstract
41
- def summarize_pdf_abstract(pdf_path):
42
- """
43
- Reads a PDF file, extracts the abstract, and summarizes it in one sentence.
44
-
45
- Args:
46
- pdf_path: Path to the PDF file.
47
-
48
- Returns:
49
- A string containing the one-sentence summary of the abstract.
50
- """
51
-
52
- # Read the PDF file
53
- reader = PdfReader(open(pdf_path, "rb"))
54
-
55
- # Extract the abstract
56
  abstract_text = ""
57
  for page in reader.pages:
58
- # Search for keywords like "Abstract" or "Introduction"
59
- if (
60
- "Abstract" in page.extract_text()
61
- or "Introduction" in page.extract_text()
62
- ):
63
- # Extract the text following the keyword
64
- abstract_text = page.extract_text()
65
- break
66
 
67
- # Encode the abstract text
68
  inputs = tokenizer(abstract_text, return_tensors="pt")
69
-
70
- # Generate the summary
71
  outputs = model.generate(**inputs)
 
 
 
 
 
 
 
 
 
72
 
73
- # Decode the summary
74
- summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
75
 
76
- return summary
 
77
 
 
 
78
 
79
- # Example usage
80
- pdf_path = "/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"
81
- summarize_and_speak_pdf_abstract(pdf_path)
 
 
82
 
 
 
1
+ import gradio as gr
2
  from PyPDF2 import PdfReader
3
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
  from gtts import gTTS
5
+ from io import BytesIO
6
+ import base64
7
+ import re
8
 
 
9
  model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
10
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
11
  tokenizer = AutoTokenizer.from_pretrained(model_name)
12
 
13
# Whitespace that follows a sentence terminator (., ? or !), unless the
# terminator belongs to an abbreviation such as "e.g." or "Dr.".
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s')


def extract_first_sentence(text):
    """
    Extracts the first sentence from the given text.

    Args:
        text: The text to scan. ``None`` or an empty string is accepted.

    Returns:
        The first sentence, including its terminating punctuation and with
        surrounding whitespace removed. If no sentence boundary is found the
        whole (stripped) text is returned; empty input yields ``""``.
    """
    if not text:
        return ""

    # maxsplit=1: only the first boundary matters, so the remainder of the
    # text is never split. re.split always returns at least one element,
    # which makes indexing [0] safe without a fallback branch.
    first, *_ = _SENTENCE_BOUNDARY.split(text.strip(), maxsplit=1)
    return first
22
+
23
def summarize_pdf_abstract(pdf_file):
    """
    Summarizes the abstract page of a PDF in one sentence and generates audio.

    The first page whose text contains "Abstract" or "Introduction" is fed to
    the module-level summarization model; the first sentence of the generated
    summary is then converted to speech with gTTS.

    Args:
        pdf_file: The uploaded PDF, passed straight to ``PdfReader``.

    Returns:
        A ``(summary_sentence, audio_path)`` tuple of strings, where
        ``audio_path`` is the path of an MP3 file containing the spoken
        summary.

    Raises:
        gr.Error: If any step fails (unreadable PDF, no matching page, empty
            summary, text-to-speech failure). ``gr.Error`` is an ``Exception``
            subclass, and the original exception is chained as its cause.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import tempfile

    try:
        reader = PdfReader(pdf_file)

        abstract_text = ""
        for page in reader.pages:
            # Extract once per page: text extraction is the expensive step
            # and was previously repeated for every check.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

        if not abstract_text.strip():
            # Do not run the model on an empty prompt.
            raise ValueError(
                'No page containing "Abstract" or "Introduction" was found in the PDF.'
            )

        # truncation=True keeps over-long pages within the tokenizer's
        # configured maximum input length.
        inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs)
        # skip_special_tokens=True keeps markers such as <s> / </s> out of
        # the displayed and spoken summary.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)
        if not summary_sentence.strip():
            raise ValueError("The model produced an empty summary.")

        # Generate audio. The gr.Audio output component takes a file path
        # (or a (sample_rate, ndarray) tuple), not a Base64 string, so the
        # MP3 is written to a temporary file that outlives this call.
        speech = gTTS(text=summary_sentence, lang="en")
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
            speech.write_to_fp(audio_file)

        # Return individual output values
        return summary_sentence, audio_file.name

    except Exception as e:
        # Surface the message in the Gradio UI while preserving the cause.
        raise gr.Error(str(e)) from e
55
 
56
# UI wiring: one PDF upload in, the one-sentence summary and its audio out.
pdf_upload = gr.File(label="Upload PDF")
summary_box = gr.Textbox(label="Summary")
summary_audio = gr.Audio()

interface = gr.Interface(
    fn=summarize_pdf_abstract,
    inputs=[pdf_upload],
    outputs=[summary_box, summary_audio],
)

# Start the app and request a public share link.
interface.launch(share=True)