# https://huggingface.co./spaces/Mishmosh/MichelleAssessment3
# I was having great difficulty getting any code to run without errors. Finally when it was working I ran out of time to complete the task
# The code receives a PDF but doesnt' appear to process it.
# I still need to add the gradio interface output to show the summarized text and play the sound file
import gradio as gr

# Interface component for displaying the summarized text
summarized_textbox = gr.Textbox(type="text", label="Summarized Text")

# Interface component for playing the generated speech.
# Note: the Gradio keyword is `elem_id` (not `element_id`), and `type="filepath"`
# lets the callback return a path to a saved audio file.
speech_audio = gr.Audio(type="filepath", label="Text-to-Speech Audio", elem_id="audio_element")


def process_input(pdf_file):
    # With gr.File(type="binary") Gradio passes the uploaded file as raw bytes
    print("Received PDF file ({} bytes)".format(len(pdf_file)))
    pdf_content = pdf_file
    # Save the received PDF content locally so the extraction code below can read it
    with open("received_pdf.pdf", "wb") as output_file:
        output_file.write(pdf_content)
    # Return the content of the processed PDF file.
    # TODO: this does not yet match the two declared outputs (summary text and audio);
    # the callback should eventually return (summarized_text, audio_path).
    return pdf_content


# Interface to process the uploaded PDF and display the results
# (process_input must be defined before it is referenced here)
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.File(
            type="binary",
            label="Hello. This app is called Abstract Summariser and gives a one-sentence summary of the input PDF in both written and spoken form. Please upload a PDF file that contains an abstract.",
        ),
    ],
    outputs=[summarized_textbox, speech_audio],  # Display the summarized text and audio
)
    
# Launch the interface. On Hugging Face Spaces, share=True is not needed.
# Note: launch() typically blocks when run as a script, so the module-level pipeline
# code below is kept as work in progress rather than something the app executes.
iface.launch()
#iface.launch(share=True)
#python app.py 
#python -m pip install --upgrade pip
#pip install torch torchvision torchaudio tensorflow
# Install Rust 
#RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
#RUN python -m pip install --upgrade pip
#pip install --upgrade pip
#RUN pip install --no-cache-dir -r requirements.txt
#RUN pip install --use-feature=in-tree-build tokenizers
#!pip install PyPDF2
#!pip install sentencepiece
#!pip install pdfminer.six
#!pip install pdfplumber
#!pip install pdf2image
#!pip install Pillow
#!pip install pytesseract
#!apt-get install poppler-utils
#!apt install tesseract-ocr
#!apt install libtesseract-dev
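# Based on the imports in this file, the commented install notes above could be collected into the
# Space's dependency files. A possible (unpinned, unverified) listing:
#
# requirements.txt:
#   gradio
#   torch
#   transformers
#   datasets
#   sentencepiece
#   PyPDF2
#   pdfminer.six
#   pdfplumber
#   pdf2image
#   Pillow
#   pytesseract
#   soundfile
#
# packages.txt (system packages installed by Hugging Face Spaces):
#   poppler-utils
#   tesseract-ocr
#   libtesseract-dev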
import PyPDF2
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import pdfplumber
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import os
# Path of the PDF saved by process_input above
pdf_path = "received_pdf.pdf"
def text_extraction(element):
    # Extracting the text from the in-line text element
    line_text = element.get_text()

    # Find the formats of the text
    # Initialize the list with all the formats that appeared in the line of text
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            # Iterating through each character in the line of text
            for character in text_line:
                if isinstance(character, LTChar):
                    # Append the font name of the character
                    line_formats.append(character.fontname)
                    # Append the font size of the character
                    line_formats.append(character.size)
    # Find the unique font sizes and names in the line
    format_per_line = list(set(line_formats))

    # Return a tuple with the text in each line along with its format
    return (line_text, format_per_line)
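# For example (illustrative values), text_extraction may return something like
# ('Abstract\n', ['CMBX12', 11.96]): the line's text together with the unique font names and sizes found in it.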
# Create a function to crop the image elements from PDFs
def crop_image(element, pageObj):
    # Get the coordinates to crop the image from the PDF
    [image_left, image_top, image_right, image_bottom] = [element.x0,element.y0,element.x1,element.y1]
    # Crop the page using coordinates (left, bottom, right, top)
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)
    # Save the cropped page to a new PDF
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    # Save the cropped PDF to a new file
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)

# Create a function to convert the PDF to images
def convert_to_images(input_file,):
    images = convert_from_path(input_file)
    image = images[0]
    output_file = "PDF_image.png"
    image.save(output_file, "PNG")

# Create a function to read text from images
def image_to_text(image_path):
    # Read the image
    img = Image.open(image_path)
    # Extract the text from the image
    text = pytesseract.image_to_string(img)
    return text
# Extracting tables from the page

def extract_table(pdf_path, page_num, table_num):
    # Open the pdf file with pdfplumber
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    table_page = pdf.pages[page_num]
    # Extract the appropriate table
    table = table_page.extract_tables()[table_num]
    return table

# Convert table into the appropriate format
def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaker from the wrapped texts
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the table into a string
        table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
    # Removing the last line break
    table_string = table_string[:-1]
    return table_string
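# For example (illustrative), a small pdfplumber table such as
# [["Name", "Score"], ["A", "1\n(best)"], [None, "2"]] is converted to the string:
# |Name|Score|
# |A|1 (best)|
# |None|2|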
def read_pdf(pdf_path):
  # create a PDF file object
  pdfFileObj = open(pdf_path, 'rb')
  # create a PDF reader object (PyPDF2.PdfReader replaces the deprecated PdfFileReader in PyPDF2 3.x)
  pdfReaded = PyPDF2.PdfReader(pdfFileObj)

  # Create the dictionary to extract text from each image
  text_per_page = {}
  # We extract the pages from the PDF
  for pagenum, page in enumerate(extract_pages(pdf_path)):
      print("Elaborating Page_" +str(pagenum))
      # Initialize the variables needed for the text extraction from the page
      pageObj = pdfReaded.pages[pagenum]
      page_text = []
      line_format = []
      text_from_images = []
      text_from_tables = []
      page_content = []
      # Initialize the number of the examined tables
      table_num = 0
      first_element= True
      table_extraction_flag= False
      # Open the pdf file
      pdf = pdfplumber.open(pdf_path)
      # Find the examined page
      page_tables = pdf.pages[pagenum]
      # Find the number of tables on the page
      tables = page_tables.find_tables()


      # Find all the elements
      page_elements = [(element.y1, element) for element in page._objs]
      # Sort all the elements as they appear in the page
      page_elements.sort(key=lambda a: a[0], reverse=True)

      # Find the elements that composed a page
      for i,component in enumerate(page_elements):
          # Extract the position of the top side of the element in the PDF
          pos= component[0]
          # Extract the element of the page layout
          element = component[1]

          # Check if the element is a text element
          if isinstance(element, LTTextContainer):
              # Check if the text appeared in a table
              if table_extraction_flag == False:
                  # Use the function to extract the text and format for each text element
                  (line_text, format_per_line) = text_extraction(element)
                  # Append the text of each line to the page text
                  page_text.append(line_text)
                  # Append the format for each line containing text
                  line_format.append(format_per_line)
                  page_content.append(line_text)
              else:
                  # Omit the text that appeared in a table
                  pass

          # Check the elements for images
          if isinstance(element, LTFigure):
              # Crop the image from the PDF
              crop_image(element, pageObj)
              # Convert the cropped pdf to an image
              convert_to_images('cropped_image.pdf')
              # Extract the text from the image
              image_text = image_to_text('PDF_image.png')
              text_from_images.append(image_text)
              page_content.append(image_text)
              # Add a placeholder in the text and format lists
              page_text.append('image')
              line_format.append('image')

          # Check the elements for tables
          if isinstance(element, LTRect):
              # If the first rectangular element
              if first_element == True and (table_num+1) <= len(tables):
                  # Find the bounding box of the table
                  lower_side = page.bbox[3] - tables[table_num].bbox[3]
                  upper_side = element.y1
                  # Extract the information from the table
                  table = extract_table(pdf_path, pagenum, table_num)
                  # Convert the table information in structured string format
                  table_string = table_converter(table)
                  # Append the table string into a list
                  text_from_tables.append(table_string)
                  page_content.append(table_string)
                  # Set the flag as True to avoid the content again
                  table_extraction_flag = True
                  # Make it another element
                  first_element = False
                  # Add a placeholder in the text and format lists
                  page_text.append('table')
                  line_format.append('table')

                  # Check if we already extracted the tables from the page
                  if element.y0 >= lower_side and element.y1 <= upper_side:
                      pass
                  elif not isinstance(page_elements[i+1][1], LTRect):
                      table_extraction_flag = False
                      first_element = True
                      table_num+=1


      # Create the key of the dictionary
      dctkey = 'Page_'+str(pagenum)
      # Add the list of list as the value of the page key
      text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]

  # Closing the pdf file object
  pdfFileObj.close()

  # Deleting the additional files created
  #os.remove('cropped_image.pdf')
  #os.remove('PDF_image.png')
  return text_per_page
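# Each entry of the returned dictionary has the form
#   text_per_page["Page_0"] == [page_text, line_format, text_from_images, text_from_tables, page_content]
# where page_text holds the extracted lines and line_format the per-line font names and sizes.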

#read PDF
#pdf_path = 'test.pdf' #article 11
#pdf_path = 'https://huggingface.co./spaces/Mishmosh/MichelleAssessment3/blob/main/test.pdf' #article 11
text_per_page = read_pdf(pdf_path)

# This section finds the abstract. My plan was to detect the end of the abstract by matching the font size
# of the text after the 'Abstract' heading, but by this point the formatting information has already been
# separated from the text, so instead I extract only the one paragraph that follows the heading.
# If an abstract is longer than one paragraph this will not extract the entire abstract.
abstract_from_pdf = ''    # will hold the text of the abstract
found_abstract = False    # has the abstract been found yet
for key in text_per_page.keys():        # go through each page in the dictionary
  current_item = text_per_page[key]     # the lists extracted for this page
  for paragraphs in current_item:       # go through each list (text, formats, images, tables, content)
    for index, paragraph in enumerate(paragraphs):  # go through each line
      if paragraph == 'Abstract\n' and index + 1 < len(paragraphs):  # heading found and a paragraph follows it
        found_abstract = True
        abstract_from_pdf = paragraphs[index + 1]   # take the next paragraph
        break
    if found_abstract:
      break
  if found_abstract:
    break
print(abstract_from_pdf)
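# A possible sketch (not used above) of the plan described in the comment: use the font sizes recorded in
# line_format to keep collecting paragraphs after the 'Abstract' heading for as long as the body font size
# stays the same, instead of taking only one paragraph. It assumes it is given the parallel page_text and
# line_format lists that read_pdf builds for a single page; the function name is illustrative.
def extract_abstract_by_font(page_text, line_format):
    abstract_lines = []
    for index, line in enumerate(page_text):
        if line == 'Abstract\n' and index + 1 < len(page_text):
            # The float entries of a format list are the character font sizes
            body_sizes = {f for f in line_format[index + 1] if isinstance(f, float)}
            for text, formats in zip(page_text[index + 1:], line_format[index + 1:]):
                sizes = {f for f in formats if isinstance(f, float)}
                if sizes and sizes != body_sizes:
                    break  # the font size changed, so the abstract has ended
                abstract_lines.append(text)
            break
    return ''.join(abstract_lines)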

from transformers import pipeline
summarizer = pipeline("summarization", model="ainize/bart-base-cnn")
#summarizer = pipeline("summarization", model="linydub/bart-large-samsum") # various models were tried and the best one was selected
#summarizer = pipeline("summarization", model="slauw87/bart_summarisation")
#summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
#summarizer = pipeline("summarization", model="google/pegasus-cnn_dailymail")
#print(summarizer(abstract_from_pdf, max_length=50, min_length=5, do_sample=False))
# The pipeline returns a list like [{'summary_text': '...'}]; extract the summary string
summarized_text = summarizer(abstract_from_pdf)[0]['summary_text']
print(summarized_text)

# The aim of this section is to get a summary of just one sentence by repeatedly summarizing the summary
# while it is still longer than one sentence. Unfortunately, none of the many models I tried actually
# shortens the text to a single sentence, and I did not find a way to tell the summarization model to
# produce exactly one sentence. The attempt is kept commented out below; a working sketch of the idea follows it.
#from transformers import pipeline
#summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
#summarized_text_list_list = summarized_text_list['summary_text']
#number_of_sentences = summarized_text_list_list.count('.')
#while number_of_sentences > 1:
#    summarized_text_list_list = summarizer(summarized_text_list_list)[0]['summary_text']
#    number_of_sentences -= 1
#    print(summarized_text_list_list, number_of_sentences)
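# A minimal sketch of the intended loop, assuming the `summarizer` pipeline defined above: keep
# re-summarizing, recounting the sentences each time rather than decrementing, and stop after a few
# rounds in case the model never reaches a single sentence. The cap of 5 rounds is an arbitrary assumption.
def summarize_to_one_sentence(text, summarizer, max_rounds=5):
    summary = text
    for _ in range(max_rounds):
        if summary.count('.') <= 1:
            break  # already one sentence (or no full stop at all)
        summary = summarizer(summary)[0]['summary_text']
    return summary

#summarized_text = summarize_to_one_sentence(summarized_text, summarizer)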

#text to speech
#!pip install git+https://github.com/huggingface/transformers.git
#!pip install datasets sentencepiece
import torch
import soundfile as sf  # used below to save the generated speech to a wav file
#from IPython.display import Audio
from datasets import load_dataset
from transformers import pipeline
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
#text = "The future belongs to those who believe in the beauty of their dreams."
#text = (summarized_text_list_list)
text = (summarized_text)

#inputs = processor(text=summarized_text_list_list, return_tensors="pt")
#inputs = processor("Michelletest", return_tensors="pt")
inputs = processor(text, return_tensors="pt")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

from transformers import SpeechT5HifiGan
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Generate the speech waveform directly (passing the vocoder converts the spectrogram to audio)
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
# Save the audio to a wav file (16 kHz) so it can be played through the Gradio Audio component
audio_path = "speech.wav"
sf.write(audio_path, speech.numpy(), samplerate=16000)
#Audio(speech, rate=16000)  # IPython playback, only useful in a notebook

#new code
# summarized_text and audio_path were computed above. In Gradio the results should be returned from the
# interface function rather than assigned to components, so the assignments below only set default values;
# a sketch of wiring the whole pipeline into the callback follows.
summarized_textbox.value = summarized_text
speech_audio.value = audio_path
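# One possible way (a sketch, not what this file currently does) to wire the pieces above into a single
# Gradio callback: save the upload, extract the text, find and summarize the abstract, synthesize speech,
# and return the two values the interface declares as outputs. The function name is illustrative and it
# reuses the objects defined above (read_pdf, summarizer, processor, model, speaker_embeddings, vocoder, sf).
def summarize_and_speak(pdf_bytes):
    # pdf_bytes is the raw upload from gr.File(type="binary")
    with open("received_pdf.pdf", "wb") as output_file:
        output_file.write(pdf_bytes)
    pages = read_pdf("received_pdf.pdf")
    # Take the paragraph that follows the 'Abstract' heading (same heuristic as above)
    abstract = ''
    for lists in pages.values():
        for paragraphs in lists:
            for index, paragraph in enumerate(paragraphs):
                if paragraph == 'Abstract\n' and index + 1 < len(paragraphs):
                    abstract = paragraphs[index + 1]
                    break
    summary = summarizer(abstract)[0]['summary_text']
    tts_inputs = processor(text=summary, return_tensors="pt")
    waveform = model.generate_speech(tts_inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    sf.write("speech.wav", waveform.numpy(), samplerate=16000)
    return summary, "speech.wav"

# The interface could then use this function directly, e.g.:
#   iface = gr.Interface(fn=summarize_and_speak, inputs=gr.File(type="binary"),
#                        outputs=[summarized_textbox, speech_audio])
#   iface.launch()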