# https://huggingface.co./spaces/Mishmosh/MichelleAssessment3
# I had great difficulty getting any code to run without errors, and once it was
# finally working I ran out of time to complete the task.
# The app receives a PDF but does not yet process it end to end: the upload handler
# and the extraction/summarization/text-to-speech pipeline below are not wired together.
# I still need to connect the Gradio outputs to show the summarized text and play the audio.
import gradio as gr
# Output component for displaying the summarized text
summarized_textbox = gr.Textbox(label="Summarized Text")
# Output component for playing the generated speech (value is a path to an audio file)
speech_audio = gr.Audio(type="filepath", label="Text-to-Speech Audio", elem_id="audio_element")
# The handler must be defined before the Interface that references it
def process_input(pdf_file):
    # With type="binary", Gradio passes the uploaded file in as raw bytes
    pdf_content = pdf_file
    # Save the received PDF locally so the extraction pipeline below can read it
    with open("received_pdf.pdf", "wb") as output_file:
        output_file.write(pdf_content)
    # NOTE: the interface declares two outputs (text and audio), but this handler
    # does not produce them yet; see the commented sketch below for the intended wiring
    return pdf_content

# Interface to process the uploaded PDF and display the results
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.File(
            type="binary",
            label="Hello. This app is called Abstract Summariser and gives a one-sentence summary of the input PDF in both written and spoken form. Please upload a PDF file that contains an abstract.",
        ),
    ],
    outputs=[summarized_textbox, speech_audio],  # the summarized text and the audio
)
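# Sketch (commented out) of the intended wiring: once the extraction, summarization,
# and text-to-speech steps below are wrapped in functions, the handler can return
# both declared outputs directly. The helper names used here are hypothetical.
#def process_input(pdf_file):
#    with open("received_pdf.pdf", "wb") as output_file:
#        output_file.write(pdf_file)
#    summary = summarize_pdf_abstract("received_pdf.pdf")  # hypothetical helper
#    audio_path = synthesize_speech(summary)               # hypothetical helper
#    return summary, audio_path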
# Launching is deferred to the end of the file so that the processing pipeline below
# is defined before the app starts serving requests (see iface.launch() at the bottom;
# share=True is unnecessary on Hugging Face Spaces)
#python app.py
#python -m pip install --upgrade pip
#pip install torch torchvision torchaudio tensorflow
# Install Rust
#RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
#RUN python -m pip install --upgrade pip
#pip install --upgrade pip
#RUN pip install --no-cache-dir -r requirements.txt
#RUN pip install --use-feature=in-tree-build tokenizers
#!pip install PyPDF2
#!pip install sentencepiece
#!pip install pdfminer.six
#!pip install pdfplumber
#!pip install pdf2image
#!pip install Pillow
#!pip install pytesseract
#!apt-get install poppler-utils
#!apt install tesseract-ocr
#!apt install libtesseract-dev
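# On Hugging Face Spaces the pip packages above belong in requirements.txt and the
# apt packages (poppler-utils, tesseract-ocr, libtesseract-dev) in packages.txt;
# Spaces installs both automatically at build time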
import PyPDF2
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import pdfplumber
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import os
# Path of the PDF saved by process_input; pdf_content is local to that function,
# so the module-level pipeline reads the saved file instead
pdf_path = "received_pdf.pdf"
def text_extraction(element):
# Extracting the text from the in-line text element
line_text = element.get_text()
# Find the formats of the text
# Initialize the list with all the formats that appeared in the line of text
line_formats = []
for text_line in element:
if isinstance(text_line, LTTextContainer):
# Iterating through each character in the line of text
for character in text_line:
if isinstance(character, LTChar):
# Append the font name of the character
line_formats.append(character.fontname)
# Append the font size of the character
line_formats.append(character.size)
# Find the unique font sizes and names in the line
format_per_line = list(set(line_formats))
# Return a tuple with the text in each line along with its format
return (line_text, format_per_line)
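# For example, a line set in 10pt Helvetica would come back roughly as
# ('Some line of text\n', ['Helvetica', 9.96]); the exact font names and sizes
# depend on the fonts embedded in the PDF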
# Create a function to crop the image elements from PDFs
def crop_image(element, pageObj):
    # Get the coordinates to crop the image from the PDF; in pdfminer, y0 is the
    # bottom of the element and y1 the top (origin at the bottom-left of the page)
    [image_left, image_bottom, image_right, image_top] = [element.x0, element.y0, element.x1, element.y1]
    # Crop the page using coordinates (left, bottom, right, top)
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)
# Save the cropped page to a new PDF
cropped_pdf_writer = PyPDF2.PdfWriter()
cropped_pdf_writer.add_page(pageObj)
# Save the cropped PDF to a new file
with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
cropped_pdf_writer.write(cropped_pdf_file)
# Create a function to convert the PDF to images
def convert_to_images(input_file):
    images = convert_from_path(input_file)
    # Only the first page of the (cropped) PDF is converted
    image = images[0]
    output_file = "PDF_image.png"
    image.save(output_file, "PNG")
# Create a function to read text from images
def image_to_text(image_path):
# Read the image
img = Image.open(image_path)
# Extract the text from the image
text = pytesseract.image_to_string(img)
return text
# Extracting tables from the page
def extract_table(pdf_path, page_num, table_num):
    # Open the pdf file with pdfplumber (pdf_path is a file path, not an open handle)
    with pdfplumber.open(pdf_path) as pdf:
        # Find the examined page
        table_page = pdf.pages[page_num]
        # Extract the appropriate table
        table = table_page.extract_tables()[table_num]
    return table
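# e.g. extract_table(pdf_path, 0, 0) returns the first table on the first page as a
# list of rows, where each row is a list of cell strings (None for empty cells)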
# Convert table into the appropriate format
def table_converter(table):
table_string = ''
# Iterate through each row of the table
for row_num in range(len(table)):
row = table[row_num]
        # Replace line breaks inside wrapped cells with spaces and map None cells to 'None'
        cleaned_row = ['None' if item is None else item.replace('\n', ' ') for item in row]
# Convert the table into a string
table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
# Removing the last line break
table_string = table_string[:-1]
return table_string
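# For instance, table_converter([['Name', 'Score'], ['Ada\nLovelace', None]])
# returns '|Name|Score|\n|Ada Lovelace|None|'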
def read_pdf(pdf_path):
# create a PDF file object
pdfFileObj = open(pdf_path, 'rb')
# create a PDF reader object
    # PdfFileReader was removed in PyPDF2 3.x; PdfReader is the current name
    pdf_reader = PyPDF2.PdfReader(pdfFileObj)
# Create the dictionary to extract text from each image
text_per_page = {}
# We extract the pages from the PDF
for pagenum, page in enumerate(extract_pages(pdf_path)):
print("Elaborating Page_" +str(pagenum))
# Initialize the variables needed for the text extraction from the page
        pageObj = pdf_reader.pages[pagenum]
page_text = []
line_format = []
text_from_images = []
text_from_tables = []
page_content = []
# Initialize the number of the examined tables
table_num = 0
first_element= True
table_extraction_flag= False
# Open the pdf file
pdf = pdfplumber.open(pdf_path)
# Find the examined page
page_tables = pdf.pages[pagenum]
# Find the number of tables on the page
tables = page_tables.find_tables()
# Find all the elements
page_elements = [(element.y1, element) for element in page._objs]
# Sort all the elements as they appear in the page
page_elements.sort(key=lambda a: a[0], reverse=True)
# Find the elements that composed a page
for i,component in enumerate(page_elements):
# Extract the position of the top side of the element in the PDF
pos= component[0]
# Extract the element of the page layout
element = component[1]
# Check if the element is a text element
if isinstance(element, LTTextContainer):
                # Check if the text appeared in a table
                if not table_extraction_flag:
# Use the function to extract the text and format for each text element
(line_text, format_per_line) = text_extraction(element)
# Append the text of each line to the page text
page_text.append(line_text)
# Append the format for each line containing text
line_format.append(format_per_line)
page_content.append(line_text)
else:
# Omit the text that appeared in a table
pass
# Check the elements for images
if isinstance(element, LTFigure):
# Crop the image from the PDF
crop_image(element, pageObj)
# Convert the cropped pdf to an image
convert_to_images('cropped_image.pdf')
# Extract the text from the image
image_text = image_to_text('PDF_image.png')
text_from_images.append(image_text)
page_content.append(image_text)
# Add a placeholder in the text and format lists
page_text.append('image')
line_format.append('image')
# Check the elements for tables
if isinstance(element, LTRect):
                # If this is the first rectangular element and there are tables left on the page
                if first_element and (table_num + 1) <= len(tables):
# Find the bounding box of the table
lower_side = page.bbox[3] - tables[table_num].bbox[3]
upper_side = element.y1
# Extract the information from the table
table = extract_table(pdf_path, pagenum, table_num)
# Convert the table information in structured string format
table_string = table_converter(table)
# Append the table string into a list
text_from_tables.append(table_string)
page_content.append(table_string)
# Set the flag as True to avoid the content again
table_extraction_flag = True
# Make it another element
first_element = False
# Add a placeholder in the text and format lists
page_text.append('table')
line_format.append('table')
                # Check if we are still inside the table we already extracted; guard on
                # the flag so lower_side/upper_side are only read after being set
                if table_extraction_flag and element.y0 >= lower_side and element.y1 <= upper_side:
                    pass
                elif i + 1 >= len(page_elements) or not isinstance(page_elements[i + 1][1], LTRect):
                    table_extraction_flag = False
                    first_element = True
                    table_num += 1
# Create the key of the dictionary
dctkey = 'Page_'+str(pagenum)
# Add the list of list as the value of the page key
text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
# Closing the pdf file object
pdfFileObj.close()
# Deleting the additional files created
#os.remove('cropped_image.pdf')
#os.remove('PDF_image.png')
return text_per_page
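# read_pdf returns a dict mapping 'Page_0', 'Page_1', ... to five parallel lists:
# [page_text, line_format, text_from_images, text_from_tables, page_content]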
#read PDF
#pdf_path = 'test.pdf' #article 11
#pdf_path = 'https://huggingface.co./spaces/Mishmosh/MichelleAssessment3/blob/main/test.pdf' #article 11
text_per_page = read_pdf(pdf_path)
# This section finds the abstract. My plan was to find the end of the abstract by
# matching the font size of the word 'Abstract', but that is not possible at this
# point in the pipeline because the formatting has already been separated from the
# text (a commented sketch of that plan follows the loop below).
# Instead I extract just one paragraph; if an abstract spans more than one
# paragraph, this will not capture all of it.
abstract_from_pdf = ''   # will hold the text of the abstract
found_abstract = False   # has the abstract been found yet?
for key in text_per_page.keys():        # go through the pages in the dictionary
    current_item = text_per_page[key]   # the five per-page lists
    for paragraphs in current_item:     # go through each list
        for index, paragraph in enumerate(paragraphs):  # go through each entry
            # Does this entry consist of the 'Abstract' heading?
            if paragraph == 'Abstract\n' and index + 1 < len(paragraphs):
                found_abstract = True                      # the heading has been found
                abstract_from_pdf = paragraphs[index + 1]  # take the paragraph after it
                break
        if found_abstract:
            break
    if found_abstract:
        break
print(abstract_from_pdf)
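# Sketch (commented out) of the original plan: line_format is preserved in
# text_per_page, so the abstract's extent could be estimated by collecting
# consecutive paragraphs whose format matches the one that follows the 'Abstract'
# heading. Untested illustration, assuming the abstract is on the first page:
#page_text, line_format = text_per_page['Page_0'][0], text_per_page['Page_0'][1]
#idx = page_text.index('Abstract\n')
#body_format = line_format[idx + 1]
#abstract_paragraphs = []
#for text_line, fmt in zip(page_text[idx + 1:], line_format[idx + 1:]):
#    if fmt != body_format:   # stop once the font/size changes again
#        break
#    abstract_paragraphs.append(text_line)
#abstract_from_pdf = ''.join(abstract_paragraphs)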
from transformers import pipeline
# Several models were tried and ainize/bart-base-cnn was selected as the best
summarizer = pipeline("summarization", model="ainize/bart-base-cnn")
#summarizer = pipeline("summarization", model="linydub/bart-large-samsum")
#summarizer = pipeline("summarization", model="slauw87/bart_summarisation")
#summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
#summarizer = pipeline("summarization", model="google/pegasus-cnn_dailymail")
#print(summarizer(abstract_from_pdf, max_length=50, min_length=5, do_sample=False))
# The pipeline returns a list containing one dict; keep just the summary string
summarized_text = summarizer(abstract_from_pdf)[0]['summary_text']
print(summarized_text)
# The aim of this section is to reduce the summary to a single sentence by
# repeatedly summarizing the summary while it is longer than one sentence.
# Unfortunately, none of the models I tried would shorten the text all the way
# down to one sentence, and I did not find a way to fine-tune or constrain the
# summarization pipeline to force single-sentence output.
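# A minimal sketch of the loop described above (an illustration, not tested with
# every model listed): re-summarize while the summary still contains more than one
# sentence, counting full stops, with an iteration cap so it cannot loop forever
# if the model never shortens the text further.
one_sentence_summary = summarized_text
for _ in range(5):
    if one_sentence_summary.count('.') <= 1:
        break
    one_sentence_summary = summarizer(one_sentence_summary)[0]['summary_text']
#summarized_text = one_sentence_summary  # uncomment to pass the shorter summary to the TTS below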
#text to speech
#!pip install git+https://github.com/huggingface/transformers.git
#!pip install datasets sentencepiece
import torch
import soundfile as sf  # used below to save the generated waveform as a .wav file
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
#text = "The future belongs to those who believe in the beauty of their dreams."
#text = (summarized_text_list_list)
text = (summarized_text)
#inputs = processor(text=summarized_text_list_list, return_tensors="pt")
#inputs = processor("Michelletest", return_tensors="pt")
inputs = processor(text, return_tensors="pt")
# Load a speaker embedding (x-vector) that gives the synthesized voice its timbre
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
from transformers import SpeechT5HifiGan
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# generate_speech runs the model and the HiFi-GAN vocoder in one step and returns
# a 16 kHz waveform tensor
with torch.no_grad():
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
# Write the waveform to disk so it can be played back by the Gradio Audio component
audio_path = "speech.wav"
sf.write(audio_path, speech.numpy(), samplerate=16000)
#new code
# Pre-fill the output components with the results computed above. A cleaner design
# would compute these inside process_input and return them, as noted at the top of
# the file.
summarized_textbox.value = summarized_text
speech_audio.value = audio_path  # gr.Audio takes a file path via .value, not .file

# Launch last, after everything the app depends on has been defined
iface.launch()  # share=True is unnecessary on Hugging Face Spaces