# Page header captured with this file (not part of the code):
# Spaces status "Sleeping"; file size 5,627 bytes; b2ad712.
# The viewer's 1-180 line-number gutter is omitted.
from PIL import Image
import fitz # PyMuPDF
import os
import pyheif
def pdf_to_images(pdf_path):
    """Render every page of a PDF file to a PIL image.

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        A list with one RGB PIL image per page, in page order. An empty list
        is returned (after printing a message) when ``pdf_path`` does not
        exist.
    """
    # Ensure the PDF file exists
    if not os.path.exists(pdf_path):
        print(f"The file {pdf_path} does not exist.")
        return []

    pdf_document = fitz.open(pdf_path)
    try:
        images = []
        for page in pdf_document:
            # alpha=False keeps the samples at 3 bytes per pixel, which is
            # what the "RGB" mode passed to Image.frombytes expects.
            pix = page.get_pixmap(alpha=False)
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
        return images
    finally:
        # Release the document even when rendering a page raises.
        pdf_document.close()
def heic_to_image(heic_path):
    """Decode a HEIC file into a PIL image.

    A path ending in ".HEIC" is first renamed on disk so that it ends in
    ".heic", and the renamed file is the one that gets decoded.

    Args:
        heic_path: Path of the HEIC file to decode.

    Returns:
        The decoded PIL image. An empty list is returned (after printing a
        message) when the file does not exist or decoding raises.
    """
    # Guard clause: nothing to decode if the file is missing.
    if not os.path.exists(heic_path):
        print(f"The file {heic_path} does not exist.")
        return []

    upper_ext = ".HEIC"
    if heic_path.endswith(upper_ext):
        # Swap the upper-case extension for its lower-case form on disk.
        new_file_path = heic_path[: -len(upper_ext)] + ".heic"
        os.rename(heic_path, new_file_path)
        print(f"Renamed: {heic_path} to {new_file_path}")
        heic_path = new_file_path

    try:
        decoded = pyheif.read(heic_path)
        # Build the PIL image straight from the raw decoded buffer.
        return Image.frombytes(
            decoded.mode,
            decoded.size,
            decoded.data,
            "raw",
            decoded.mode,
            decoded.stride,
        )
    except Exception as e:
        print(f"An error occurred while processing the HEIC file: {e}")
        return []
def _match_suffix(path, suffixes):
    """Return the entry of *suffixes* that *path* ends with, ignoring case, or None."""
    for suffix in suffixes:
        if path[-len(suffix):].lower() == suffix:
            return suffix
    return None


def process_image(file_path):
    """Normalise an uploaded file into an image path usable by the UI.

    The extension is matched case-insensitively:

    * PDF: every page is rendered to ``data_processed/<name>_page_<n>.png``
      and the first page is also saved next to the upload with a ``.png``
      extension; that sibling path is returned.
    * PNG / JPG / JPEG: returned unchanged when the extension is already
      lower-case, otherwise the file is renamed to the lower-case extension
      and the new path is returned.
    * HEIC: decoded, saved to ``data_processed/<name>.png`` and also written
      (as PNG data) to the original upload path, which is returned.

    Args:
        file_path: Path of the uploaded file; may be ``None`` or empty when
            nothing was uploaded.

    Returns:
        A ``(path, message)`` tuple. On success ``message`` is ``""``. When
        the input is missing, unsupported, or cannot be converted, the
        result is ``(None, "No image uploaded or invalid file")``.
    """
    invalid = (None, "No image uploaded or invalid file")
    if not file_path:
        return invalid

    suffix = _match_suffix(file_path, (".pdf", ".png", ".jpg", ".jpeg", ".heic"))
    if suffix is None:
        return invalid

    output_dir = "data_processed"

    if suffix == ".pdf":
        images = pdf_to_images(file_path)
        if not images:
            return invalid
        os.makedirs(output_dir, exist_ok=True)
        label_prefix = os.path.basename(file_path).split(".")[0]
        for i, img in enumerate(images, start=1):
            image_path = os.path.join(output_dir, f"{label_prefix}_page_{i}.png")
            img.save(image_path, "PNG")
            print(f"Saved {image_path}")
        # Save next to the path assigned by gradio so that we can benefit
        # from gradio's cache. Only the trailing extension is swapped, so a
        # ".pdf" elsewhere in the path is left alone.
        png_path = file_path[: -len(suffix)] + ".png"
        # Only the first page of the pdf is processed in the gradio UI.
        # TODO: Accommodate multiple images
        images[0].save(png_path, "PNG")
        print(f"Saved {png_path}")
        return png_path, ""

    if suffix == ".heic":
        image = heic_to_image(file_path)
        # heic_to_image returns an empty list instead of an image on failure.
        if not isinstance(image, Image.Image):
            return invalid
        os.makedirs(output_dir, exist_ok=True)
        image_path = os.path.join(
            output_dir, f"{os.path.splitext(os.path.basename(file_path))[0]}.png"
        )
        image.save(image_path, "PNG")
        # Saving back to the same path assigned by gradio so that we can
        # benefit from gradio's cache.
        # NOTE: this writes PNG data under the original HEIC file name.
        image.save(file_path, "PNG")
        print(f"Saved {image_path}")
        print(f"Saved {file_path}")
        return file_path, ""

    # PNG / JPG / JPEG
    if file_path.endswith(suffix):
        print(
            f"file_path from the image processing function for compatible images: {file_path}"
        )
        return file_path, ""

    # Extension is upper- or mixed-case: rename the file to the lower-case form.
    new_file_path = file_path[: -len(suffix)] + suffix
    os.rename(file_path, new_file_path)
    print(f"Renamed: {file_path} to {new_file_path}")
    return new_file_path, ""
# Example usage: render a sample PDF and write each page out as a PNG.
if __name__ == "__main__":
    sample_pdf = "data/Augustin REMY Mindmap OOP .pdf"
    rendered_pages = pdf_to_images(sample_pdf)
    # Nothing is written when the PDF is missing or has no pages.
    if rendered_pages:
        target_dir = "data_processed"
        os.makedirs(target_dir, exist_ok=True)
        page_no = 1
        for page_img in rendered_pages:
            out_path = os.path.join(target_dir, f"page_{page_no}.png")
            page_img.save(out_path, "PNG")
            print(f"Saved {out_path}")
            page_no += 1
# (end of captured file view)