Mishmosh committed on
Commit 59dc3f5
1 Parent(s): 5c700c5

Update app.py

Files changed (1)
  1. app.py +294 -1
app.py CHANGED
@@ -1,2 +1,295 @@
  # https://huggingface.co/spaces/Mishmosh/MichelleAssessment3
- import PyPDF2
+ !pip install PyPDF2
+ !pip install sentencepiece
+ !pip install pdfminer.six
+ !pip install pdfplumber
+ !pip install pdf2image
+ !pip install Pillow
+ !pip install pytesseract
+ # @title
+ !apt-get install poppler-utils
+ !apt install tesseract-ocr
+ !apt install libtesseract-dev
+ import PyPDF2
+ from pdfminer.high_level import extract_pages, extract_text
+ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
+ import pdfplumber
+ from PIL import Image
+ from pdf2image import convert_from_path
+ import pytesseract
+ import os
+ def text_extraction(element):
+     # Extracting the text from the in-line text element
+     line_text = element.get_text()
+
+     # Find the formats of the text
+     # Initialize the list with all the formats that appeared in the line of text
+     line_formats = []
+     for text_line in element:
+         if isinstance(text_line, LTTextContainer):
+             # Iterating through each character in the line of text
+             for character in text_line:
+                 if isinstance(character, LTChar):
+                     # Append the font name of the character
+                     line_formats.append(character.fontname)
+                     # Append the font size of the character
+                     line_formats.append(character.size)
+     # Find the unique font sizes and names in the line
+     format_per_line = list(set(line_formats))
+
+     # Return a tuple with the text in each line along with its format
+     return (line_text, format_per_line)
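A minimal usage sketch for text_extraction, shown only as an illustration; sample.pdf is a hypothetical input file, not part of this commit:

```python
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

# Print every text line of a hypothetical sample.pdf together with the
# (font name, font size) formats that text_extraction collects for it.
for page_layout in extract_pages("sample.pdf"):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            line_text, format_per_line = text_extraction(element)
            print(repr(line_text), format_per_line)
```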
+ # @title
+ # Create a function to crop the image elements from PDFs
+ def crop_image(element, pageObj):
+     # Get the coordinates to crop the image from the PDF
+     [image_left, image_top, image_right, image_bottom] = [element.x0,element.y0,element.x1,element.y1]
+     # Crop the page using coordinates (left, bottom, right, top)
+     pageObj.mediabox.lower_left = (image_left, image_bottom)
+     pageObj.mediabox.upper_right = (image_right, image_top)
+     # Save the cropped page to a new PDF
+     cropped_pdf_writer = PyPDF2.PdfWriter()
+     cropped_pdf_writer.add_page(pageObj)
+     # Save the cropped PDF to a new file
+     with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
+         cropped_pdf_writer.write(cropped_pdf_file)
+
+ # Create a function to convert the PDF to images
+ def convert_to_images(input_file):
+     images = convert_from_path(input_file)
+     image = images[0]
+     output_file = "PDF_image.png"
+     image.save(output_file, "PNG")
+
+ # Create a function to read text from images
+ def image_to_text(image_path):
+     # Read the image
+     img = Image.open(image_path)
+     # Extract the text from the image
+     text = pytesseract.image_to_string(img)
+     return text
+ # @title
+ # Extracting tables from the page
+
+ def extract_table(pdf_path, page_num, table_num):
+     # Open the pdf file
+     pdf = pdfplumber.open(pdf_path)
+     # Find the examined page
+     table_page = pdf.pages[page_num]
+     # Extract the appropriate table
+     table = table_page.extract_tables()[table_num]
+     return table
+
+ # Convert table into the appropriate format
+ def table_converter(table):
+     table_string = ''
+     # Iterate through each row of the table
+     for row_num in range(len(table)):
+         row = table[row_num]
+         # Remove the line breaker from the wrapped texts
+         cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
+         # Convert the table into a string
+         table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
+     # Removing the last line break
+     table_string = table_string[:-1]
+     return table_string
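A small sketch of the pipe-delimited string table_converter produces, using a made-up two-row table in the same row/cell shape that pdfplumber's extract_tables() returns:

```python
# Illustrative only: None cells become the string 'None', embedded line breaks become spaces.
demo_table = [["Name", "Score"], ["Alice\nSmith", None]]
print(table_converter(demo_table))
# Output:
# |Name|Score|
# |Alice Smith|None|
```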
+ # @title
+ def read_pdf(pdf_path):
+     # create a PDF file object
+     pdfFileObj = open(pdf_path, 'rb')
+     # create a PDF reader object
+     pdfReaded = PyPDF2.PdfReader(pdfFileObj)
+
+     # Create the dictionary to extract text from each image
+     text_per_page = {}
+     # We extract the pages from the PDF
+     for pagenum, page in enumerate(extract_pages(pdf_path)):
+         print("Elaborating Page_" +str(pagenum))
+         # Initialize the variables needed for the text extraction from the page
+         pageObj = pdfReaded.pages[pagenum]
+         page_text = []
+         line_format = []
+         text_from_images = []
+         text_from_tables = []
+         page_content = []
+         # Initialize the number of the examined tables
+         table_num = 0
+         first_element= True
+         table_extraction_flag= False
+         # Open the pdf file
+         pdf = pdfplumber.open(pdf_path)
+         # Find the examined page
+         page_tables = pdf.pages[pagenum]
+         # Find the number of tables on the page
+         tables = page_tables.find_tables()
+
+         # Find all the elements
+         page_elements = [(element.y1, element) for element in page._objs]
+         # Sort all the elements as they appear in the page
+         page_elements.sort(key=lambda a: a[0], reverse=True)
+
+         # Find the elements that composed a page
+         for i,component in enumerate(page_elements):
+             # Extract the position of the top side of the element in the PDF
+             pos= component[0]
+             # Extract the element of the page layout
+             element = component[1]
+
+             # Check if the element is a text element
+             if isinstance(element, LTTextContainer):
+                 # Check if the text appeared in a table
+                 if table_extraction_flag == False:
+                     # Use the function to extract the text and format for each text element
+                     (line_text, format_per_line) = text_extraction(element)
+                     # Append the text of each line to the page text
+                     page_text.append(line_text)
+                     # Append the format for each line containing text
+                     line_format.append(format_per_line)
+                     page_content.append(line_text)
+                 else:
+                     # Omit the text that appeared in a table
+                     pass
+
+             # Check the elements for images
+             if isinstance(element, LTFigure):
+                 # Crop the image from the PDF
+                 crop_image(element, pageObj)
+                 # Convert the cropped pdf to an image
+                 convert_to_images('cropped_image.pdf')
+                 # Extract the text from the image
+                 image_text = image_to_text('PDF_image.png')
+                 text_from_images.append(image_text)
+                 page_content.append(image_text)
+                 # Add a placeholder in the text and format lists
+                 page_text.append('image')
+                 line_format.append('image')
+
+             # Check the elements for tables
+             if isinstance(element, LTRect):
+                 # If the first rectangular element
+                 if first_element == True and (table_num+1) <= len(tables):
+                     # Find the bounding box of the table
+                     lower_side = page.bbox[3] - tables[table_num].bbox[3]
+                     upper_side = element.y1
+                     # Extract the information from the table
+                     table = extract_table(pdf_path, pagenum, table_num)
+                     # Convert the table information in structured string format
+                     table_string = table_converter(table)
+                     # Append the table string into a list
+                     text_from_tables.append(table_string)
+                     page_content.append(table_string)
+                     # Set the flag as True to avoid the content again
+                     table_extraction_flag = True
+                     # Make it another element
+                     first_element = False
+                     # Add a placeholder in the text and format lists
+                     page_text.append('table')
+                     line_format.append('table')
+
+                 # Check if we already extracted the tables from the page
+                 if element.y0 >= lower_side and element.y1 <= upper_side:
+                     pass
+                 elif not isinstance(page_elements[i+1][1], LTRect):
+                     table_extraction_flag = False
+                     first_element = True
+                     table_num+=1
+
+         # Create the key of the dictionary
+         dctkey = 'Page_'+str(pagenum)
+         # Add the list of list as the value of the page key
+         text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
+
+     # Closing the pdf file object
+     pdfFileObj.close()
+
+     # Deleting the additional files created
+     #os.remove('cropped_image.pdf')
+     #os.remove('PDF_image.png')
+     return text_per_page
+
+ # google drive
+ from google.colab import drive
+ drive.mount('/content/drive')
+ # read PDF
+
+ pdf_path = '/content/drive/MyDrive/ArticleHidden.pdf' #article 11
+
+ text_per_page = read_pdf(pdf_path)
+
+ # This section finds the abstract. My plan was to find the end of the abstract by locating text with the same font size as the word 'Abstract',
+ # but that was not possible here because the formatting information has already been discarded.
+ # Instead I extracted just one paragraph. If an abstract is more than one paragraph, this will not extract the entire abstract.
+ abstract_from_pdf='' # empty variable that will hold the text of the abstract
+ found_abstract=False # has the abstract been found
+ for key in text_per_page.keys(): # go through the keys in the dictionary
+     current_item=text_per_page[key] # the lists stored for the current page
+     for paragraphs in current_item: # go through each item
+         for index,paragraph in enumerate(paragraphs): # go through each line
+             if 'Abstract\n' == paragraph: # does this line match the heading 'Abstract'
+                 found_abstract=True # the word abstract has been found
+                 abstract_from_pdf=paragraphs[index+1] # get the next paragraph
+             if found_abstract: # if the abstract has been found
+                 break
+ print(abstract_from_pdf)
+
+ from transformers import pipeline
+ summarizer = pipeline("summarization", model="ainize/bart-base-cnn")
+ #summarizer = pipeline("summarization", model="linydub/bart-large-samsum") # various models were tried and the best one was selected
+ #summarizer = pipeline("summarization", model="slauw87/bart_summarisation")
+ #summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ #summarizer = pipeline("summarization", model="google/pegasus-cnn_dailymail")
+ #print(summarizer(abstract_from_pdf, max_length=50, min_length=5, do_sample=False))
+ summarized_text=(summarizer(abstract_from_pdf))
+ print(summarized_text)
+ #summary_of_abstract=str(summarizer)
+ #type(summary_of_abstract)
+ #print(summary_of_abstract)
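For reference, a sketch of the output shape the summarization pipeline returns, which the next block indexes into; the example text here is made up:

```python
# The pipeline returns a list with one dict per input.
example_output = [{"summary_text": "A one-paragraph summary of the abstract."}]
summary_string = example_output[0]["summary_text"]
print(summary_string)
```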
+
+ # The aim of this section is to get a summary of just one sentence by repeatedly summarizing the summary while it is longer than one sentence.
+ # Unfortunately, none of the many models I tried actually shortens the text to a single sentence.
+ # I searched for ways to fine-tune the summarization model so that the summary is produced as a single sentence, but did not find a way to implement it.
+ from transformers import pipeline
+ summarized_text_list_list=summarized_text[0]['summary_text'] # take the summary string out of the pipeline output
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ #print(summarizer)
+ number_of_sentences=summarized_text_list_list.count('.')
+ print(number_of_sentences)
+ while(number_of_sentences)>1:
+     print(number_of_sentences)
+     summarized_text_list_list=summarizer(summarized_text_list_list)[0]['summary_text']
+     number_of_sentences-=1
+ print(summarized_text_list_list)
+ print(number_of_sentences)
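One possible variant of this loop, sketched here only as an alternative: re-count the sentences of each new summary instead of decrementing a counter, and cap the number of passes so the loop always terminates.

```python
# Hypothetical alternative, not the approach used above.
alt_summary = summarized_text_list_list
for _ in range(5):  # arbitrary cap on re-summarization passes
    if alt_summary.count('.') <= 1:
        break
    alt_summary = summarizer(alt_summary)[0]['summary_text']
print(alt_summary)
```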
+
+ # text to speech
+ !pip install git+https://github.com/huggingface/transformers.git
+ !pip install datasets sentencepiece
+ import torch
+ import soundfile as sf
+ from IPython.display import Audio
+ from datasets import load_dataset
+ from transformers import pipeline
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+ #text = "The future belongs to those who believe in the beauty of their dreams."
+ #text = (summarized_text_list_list)
+
+ inputs = processor(text=summarized_text_list_list, return_tensors="pt")
+ from datasets import load_dataset
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+
+ import torch
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+ spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+ from transformers import SpeechT5HifiGan
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+ with torch.no_grad():
+     speech = vocoder(spectrogram)
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+ Audio(speech, rate=16000)
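soundfile is imported above but never used; a minimal sketch of writing the generated waveform to disk (the file name is only an example):

```python
# Save the 16 kHz waveform produced by generate_speech to a WAV file.
sf.write("summary_speech.wav", speech.numpy(), samplerate=16000)
```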