Spaces:
Sleeping
Sleeping
from PIL import Image | |
import fitz # PyMuPDF | |
import os | |
import pyheif | |
def pdf_to_images(pdf_path):
    """Render every page of a PDF file into a PIL image.

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        A list with one ``"RGB"`` PIL image per page, in page order. When
        ``pdf_path`` does not exist a message is printed and an empty list
        is returned.
    """
    if not os.path.exists(pdf_path):
        print(f"The file {pdf_path} does not exist.")
        return []

    images = []
    # The context manager closes the document even when rendering a page
    # raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            pix = page.get_pixmap()
            # NOTE(review): "RGB" assumes the pixmap holds 3 bytes per pixel
            # (no alpha channel), which is what a default get_pixmap() call
            # is expected to produce - confirm if arguments are ever added.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)
    return images
def heic_to_image(heic_path):
    """Decode a HEIC file into a PIL image.

    A path ending in ``.HEIC`` is first renamed on disk so that it ends in
    ``.heic``; the renamed file is the one that gets decoded.

    Args:
        heic_path: Path of the HEIC file to decode.

    Returns:
        The decoded PIL image. An empty list is returned instead (after
        printing a message) when the file does not exist or when reading or
        decoding it raises.
    """
    upper_suffix = ".HEIC"

    # Bail out early when there is nothing on disk to read.
    if not os.path.exists(heic_path):
        print(f"The file {heic_path} does not exist.")
        return []

    # Normalise an upper-case extension by renaming the file itself.
    if heic_path.endswith(upper_suffix):
        lowered_path = heic_path[: -len(upper_suffix)] + ".heic"
        os.rename(heic_path, lowered_path)
        print(f"Renamed: {heic_path} to {lowered_path}")
        heic_path = lowered_path

    try:
        decoded = pyheif.read(heic_path)
        # Hand the raw decoded buffer to PIL using the "raw" decoder.
        return Image.frombytes(
            decoded.mode,
            decoded.size,
            decoded.data,
            "raw",
            decoded.mode,
            decoded.stride,
        )
    except Exception as e:
        # Best effort: report the problem and signal failure with [].
        print(f"An error occurred while processing the HEIC file: {e}")
        return []
def process_image(file_path):
    """Turn an uploaded file into an image path the Gradio UI can use.

    The file type is chosen from the extension, matched case-insensitively:

    * ``.pdf``: every page is rendered to
      ``data_processed/<name>_page_<n>.png``; the first page is also saved
      next to the upload with the extension swapped for ``.png`` and that
      path is returned.
    * ``.png`` / ``.jpg`` / ``.jpeg``: returned unchanged when the extension
      is already lower-case; otherwise the file is renamed on disk to the
      lower-case extension and the new path is returned.
    * ``.heic``: decoded with ``heic_to_image``, saved as
      ``data_processed/<name>.png`` and also written as PNG data to
      ``file_path`` itself, which is returned.

    Args:
        file_path: Path of the uploaded file; may be ``None`` or empty when
            nothing was uploaded.

    Returns:
        ``(path, "")`` on success, or
        ``(None, "No image uploaded or invalid file")`` when no file was
        given, the extension is unsupported, or conversion produced no
        image.
    """
    invalid = (None, "No image uploaded or invalid file")
    output_dir = "data_processed"

    # Nothing uploaded: answer with the error tuple instead of crashing on
    # the string methods below.
    if not file_path:
        return invalid

    lowered = file_path.lower()
    matched = next(
        (
            ext
            for ext in (".pdf", ".png", ".jpg", ".jpeg", ".heic")
            if lowered.endswith(ext)
        ),
        None,
    )
    if matched is None:
        return invalid

    # Path without its extension, whatever the extension's original case.
    stem = file_path[: -len(matched)]

    if matched == ".pdf":
        images = pdf_to_images(file_path)
        if not images:
            return invalid
        os.makedirs(output_dir, exist_ok=True)
        label_prefix = os.path.splitext(os.path.basename(file_path))[0]
        for i, img in enumerate(images, start=1):
            image_path = os.path.join(output_dir, f"{label_prefix}_page_{i}.png")
            img.save(image_path, "PNG")
            print(f"Saved {image_path}")
        # Save next to the path assigned by gradio so that we can benefit
        # from gradio's cache. Only the trailing extension is swapped, so a
        # ".pdf" occurring elsewhere in the path is left alone.
        png_path = stem + ".png"
        # Only the first page is handed to the gradio UI.
        # TODO: Accommodate multiple pages
        images[0].save(png_path, "PNG")
        print(f"Saved {png_path}")
        return png_path, ""

    if matched in (".png", ".jpg", ".jpeg"):
        if file_path.endswith(matched):
            # Extension is already lower-case: nothing to do.
            print(
                f"file_path from the image processing function for compatible images: {file_path}"
            )
            return file_path, ""
        # Upper- or mixed-case extension: rename the file to the lower-case
        # form and hand back the new path.
        new_file_path = stem + matched
        os.rename(file_path, new_file_path)
        print(f"Renamed: {file_path} to {new_file_path}")
        return new_file_path, ""

    # Remaining case: HEIC.
    image = heic_to_image(file_path)
    # heic_to_image signals failure with an empty list; do not try to save it.
    if not isinstance(image, Image.Image):
        return invalid
    os.makedirs(output_dir, exist_ok=True)
    image_path = os.path.join(
        output_dir, f"{os.path.splitext(os.path.basename(file_path))[0]}.png"
    )
    image.save(image_path, "PNG")
    # Saving it back to the same path assigned by gradio so that we can
    # benefit from gradio's cache (the file keeps its name but holds PNG data).
    image.save(file_path, "PNG")
    print(f"Saved {image_path}")
    print(f"Saved {file_path}")
    return file_path, ""
# Example usage: render a sample PDF and dump each page as a PNG file.
if __name__ == "__main__":
    sample_pdf = "data/Augustin REMY Mindmap OOP .pdf"
    rendered_pages = pdf_to_images(sample_pdf)
    # An empty list (e.g. the PDF is missing) means there is nothing to write.
    if rendered_pages:
        target_dir = "data_processed"
        os.makedirs(target_dir, exist_ok=True)
        # Pages are numbered from 1 in the output file names.
        for number, page_image in enumerate(rendered_pages, start=1):
            destination = os.path.join(target_dir, f"page_{number}.png")
            page_image.save(destination, "PNG")
            print(f"Saved {destination}")