OOP_KG_Transform_POC / any_to_image.py
Zaherrr's picture
Upload 8 files
b2ad712 verified
raw
history blame
5.63 kB
from PIL import Image
import fitz # PyMuPDF
import os
import pyheif
def pdf_to_images(pdf_path):
    """Render every page of a PDF file to a PIL image.

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        A list with one RGB ``PIL.Image.Image`` per page, in page order.
        An empty list is returned (and a message printed) when ``pdf_path``
        is empty/``None`` or does not exist on disk.

    Raises:
        Whatever ``fitz.open`` / page rendering raises for an unreadable or
        corrupt document; the document handle is still closed in that case.
    """
    # Guard against None/"" first: os.path.exists(None) raises TypeError.
    if not pdf_path or not os.path.exists(pdf_path):
        print(f"The file {pdf_path} does not exist.")
        return []

    pdf_document = fitz.open(pdf_path)
    try:
        images = []
        for page in pdf_document:
            # get_pixmap() is called with its defaults; the samples buffer is
            # interpreted as packed RGB, matching the "RGB" mode given below.
            pix = page.get_pixmap()
            images.append(
                Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            )
    finally:
        # Always release the document, even if a page fails to render.
        pdf_document.close()
    return images
def heic_to_image(heic_path):
    """Decode a HEIC file into a PIL image.

    If ``heic_path`` ends with the upper-case suffix ``.HEIC`` the file is
    first renamed on disk so that it ends with ``.heic``, and the renamed
    file is the one that gets decoded.

    Args:
        heic_path: Path of the HEIC file to read.

    Returns:
        The decoded ``PIL.Image.Image``. An empty list is returned (and a
        message printed) when the file does not exist or decoding fails.
    """
    if not os.path.exists(heic_path):
        print(f"The file {heic_path} does not exist.")
        return []

    upper_suffix = ".HEIC"
    if heic_path.endswith(upper_suffix):
        # Swap the upper-case suffix for its lower-case form on disk.
        new_file_path = heic_path[: -len(upper_suffix)] + ".heic"
        os.rename(heic_path, new_file_path)
        print(f"Renamed: {heic_path} to {new_file_path}")
        heic_path = new_file_path

    try:
        decoded = pyheif.read(heic_path)
        # Build the PIL image straight from the decoder's raw pixel buffer.
        return Image.frombytes(
            decoded.mode,
            decoded.size,
            decoded.data,
            "raw",
            decoded.mode,
            decoded.stride,
        )
    except Exception as e:
        print(f"An error occurred while processing the HEIC file: {e}")
        return []
def process_image(file_path):
    """Normalise an uploaded file into an image path usable by the UI.

    Supported inputs, matched case-insensitively on the file suffix:

    * ``.pdf``  - every page is rendered to ``data_processed/<name>_page_<n>.png``
      and the first page is additionally saved next to the input, with the
      ``.pdf`` suffix replaced by ``.png``; that path is returned.
      TODO: accommodate multi-page PDFs (only the first page is returned).
    * ``.png`` / ``.jpg`` / ``.jpeg`` - returned as is when the suffix is
      already lower-case, otherwise the file is renamed on disk so that its
      suffix is lower-case and the new path is returned.
    * ``.heic`` - decoded via ``heic_to_image`` and saved as PNG both to
      ``data_processed/<name>.png`` and back onto ``file_path`` itself.

    Args:
        file_path: Path of the uploaded file; may be ``None`` or empty.

    Returns:
        A ``(path, message)`` tuple. On success ``message`` is ``""``; on
        failure ``path`` is ``None`` and ``message`` describes the problem.

    Raises:
        OSError: If a rename or save on disk fails.
    """
    invalid_result = (None, "No image uploaded or invalid file")
    # A missing upload arrives as None/""; report it instead of crashing.
    if not file_path:
        return invalid_result

    output_dir = "data_processed"
    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        images = pdf_to_images(file_path)
        if not images:
            return invalid_result
        # Save the pages so their paths can be handed to the gradio functions.
        os.makedirs(output_dir, exist_ok=True)
        label_prefix = os.path.basename(file_path).split(".")[0]
        for i, img in enumerate(images, start=1):
            image_path = os.path.join(output_dir, f"{label_prefix}_page_{i}.png")
            img.save(image_path, "PNG")
            print(f"Saved {image_path}")
        # Swap only the trailing suffix; str.replace would also rewrite any
        # ".pdf" appearing earlier in the path (e.g. in a directory name).
        file_path = file_path[: -len(".pdf")] + ".png"
        # Saved next to the upload so that gradio's cache can be reused.
        images[0].save(file_path, "PNG")
        print(f"Saved {file_path}")
        return file_path, ""

    image_suffix = next(
        (s for s in (".png", ".jpg", ".jpeg") if lowered.endswith(s)), None
    )
    if image_suffix is not None:
        new_file_path = file_path[: -len(image_suffix)] + image_suffix
        if new_file_path == file_path:
            print(
                f"file_path from the image processing function for compatible images: {file_path}"
            )
            return file_path, ""
        # Suffix has upper-case letters: rename so it is lower-case on disk.
        os.rename(file_path, new_file_path)
        print(f"Renamed: {file_path} to {new_file_path}")
        return new_file_path, ""

    if lowered.endswith(".heic"):
        image = heic_to_image(file_path)
        # heic_to_image signals failure with an empty list, not an exception.
        if image is None or isinstance(image, list):
            return invalid_result
        os.makedirs(output_dir, exist_ok=True)
        image_path = os.path.join(
            output_dir, f"{os.path.splitext(os.path.basename(file_path))[0]}.png"
        )
        image.save(image_path, "PNG")
        # Saved back to the path assigned by gradio so its cache can be reused.
        image.save(file_path, "PNG")
        print(f"Saved {image_path}")
        print(f"Saved {file_path}")
        return file_path, ""

    return invalid_result
# Example usage: render a sample PDF and dump each page as a PNG for inspection.
if __name__ == "__main__":
    rendered_pages = pdf_to_images("data/Augustin REMY Mindmap OOP .pdf")
    # Nothing is written when the PDF is missing (an empty list comes back).
    if rendered_pages:
        target_dir = "data_processed"
        os.makedirs(target_dir, exist_ok=True)
        for i, page_image in enumerate(rendered_pages, start=1):
            image_path = os.path.join(target_dir, f"page_{i}.png")
            page_image.save(image_path, "PNG")
            print(f"Saved {image_path}")