File size: 5,627 Bytes
b2ad712
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
from PIL import Image
import fitz  # PyMuPDF
import os
import pyheif


def pdf_to_images(pdf_path):
    """Render every page of a PDF file to a PIL image.

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        A list with one RGB PIL image per page, in page order. An empty list
        is returned (after printing a message) when ``pdf_path`` does not
        exist.
    """
    # Ensure the PDF file exists
    if not os.path.exists(pdf_path):
        print(f"The file {pdf_path} does not exist.")
        return []

    # Open the PDF file
    pdf_document = fitz.open(pdf_path)

    # List to store PIL images
    images = []

    try:
        # Process each page
        for page in pdf_document:
            # alpha=False keeps the samples at 3 bytes per pixel, which is
            # what the "RGB" mode passed to Image.frombytes expects.
            pix = page.get_pixmap(alpha=False)
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    finally:
        # Always release the document, even if rendering a page failed
        pdf_document.close()

    return images


def heic_to_image(heic_path):
    """Decode a HEIC file into a PIL image.

    A path ending in ``.HEIC`` is first renamed on disk to end in ``.heic``.
    If that rename fails, decoding is attempted from the original path
    instead of raising.

    Args:
        heic_path: Path of the HEIC file to decode.

    Returns:
        The decoded PIL image on success. An empty list is returned (after
        printing a message) when the file does not exist or cannot be decoded.
    """
    # Ensure the HEIC file exists
    if not os.path.exists(heic_path):
        print(f"The file {heic_path} does not exist.")
        return []

    if heic_path.endswith(".HEIC"):
        # Create the new filename by replacing .HEIC with .heic
        new_file_path = heic_path[:-5] + ".heic"
        try:
            os.rename(heic_path, new_file_path)
        except OSError as e:
            # A failed rename should not abort the conversion: keep reading
            # from the original path.
            print(f"Could not rename {heic_path} to {new_file_path}: {e}")
        else:
            print(f"Renamed: {heic_path} to {new_file_path}")
            heic_path = new_file_path

    try:
        # Open the HEIC file
        heif_file = pyheif.read(heic_path)

        # Convert to a PIL image
        image = Image.frombytes(
            heif_file.mode,
            heif_file.size,
            heif_file.data,
            "raw",
            heif_file.mode,
            heif_file.stride,
        )
    except Exception as e:
        print(f"An error occurred while processing the HEIC file: {e}")
        return []

    return image


def process_image(file_path):
    """Normalise an uploaded file into an image path usable by the UI.

    Supported inputs, matched on the file extension regardless of case:

    * ``.pdf``: every page is rendered to ``data_processed/<name>_page_<i>.png``
      and the first page is also saved next to the upload with a ``.png``
      extension; that path is returned.
    * ``.png`` / ``.jpg`` / ``.jpeg``: returned as is; if the extension is not
      already lower-case the file is renamed so that it is.
    * ``.heic``: decoded, saved as PNG in ``data_processed`` and written back
      (as PNG data) over the uploaded path, which is returned.

    Args:
        file_path: Path of the uploaded file, or ``None``/empty when nothing
            was uploaded.

    Returns:
        A ``(path, message)`` tuple. On success ``path`` is the image path and
        ``message`` is ``""``. On failure ``path`` is ``None`` and ``message``
        is ``"No image uploaded or invalid file"``.
    """
    invalid = (None, "No image uploaded or invalid file")

    # Nothing uploaded: avoid calling string methods on None
    if not file_path:
        return invalid

    # Match the extension case-insensitively so ".PDF", ".Jpg", ... are accepted
    lowered = file_path.lower()
    ext = next(
        (e for e in (".pdf", ".png", ".jpg", ".jpeg", ".heic") if lowered.endswith(e)),
        None,
    )
    if ext is None:
        return invalid

    # Path without its extension, and the extension exactly as it was typed
    stem = file_path[: -len(ext)]
    original_ext = file_path[-len(ext):]

    if ext == ".pdf":
        images = pdf_to_images(file_path)
        if not images:
            return invalid

        # Save the images so we can pass their path to the rest of the gradio functions
        output_dir = "data_processed"
        os.makedirs(output_dir, exist_ok=True)
        label_prefix = os.path.basename(stem)

        for i, img in enumerate(images, start=1):
            image_path = os.path.join(output_dir, f"{label_prefix}_page_{i}.png")
            img.save(image_path, "PNG")
            print(f"Saved {image_path}")

        # Saving the first page next to the path assigned by gradio so that we
        # can benefit from gradio's cache. Only the extension is swapped.
        # TODO: Accommodate multiple images
        png_path = stem + ".png"
        images[0].save(png_path, "PNG")
        print(f"Saved {png_path}")

        return png_path, ""

    if ext in (".png", ".jpg", ".jpeg"):
        if original_ext == ext:
            print(
                f"file_path from the image processing function for compatible images: {file_path}"
            )
            return file_path, ""

        # Rename the file so that its extension is lower-case
        new_file_path = stem + ext
        os.rename(file_path, new_file_path)
        print(f"Renamed: {file_path} to {new_file_path}")
        return new_file_path, ""

    # Remaining case: ".heic"
    image = heic_to_image(file_path)
    if isinstance(image, list):
        # heic_to_image signals a missing or undecodable file with an empty list
        return invalid

    output_dir = "data_processed"
    os.makedirs(output_dir, exist_ok=True)

    image_path = os.path.join(
        output_dir, f"{os.path.splitext(os.path.basename(file_path))[0]}.png"
    )
    image.save(image_path, "PNG")
    # saving it back to the same path assigned by gradio so that we can benefit from gradio's cache
    image.save(file_path, "PNG")

    print(f"Saved {image_path}")
    print(f"Saved {file_path}")

    return file_path, ""


# Manual smoke test: render a sample PDF and dump every page as a PNG
if __name__ == "__main__":
    sample_pdf = "data/Augustin REMY Mindmap OOP .pdf"
    rendered_pages = pdf_to_images(sample_pdf)

    # Nothing is written (and no directory is created) when no page came back
    if rendered_pages:
        dump_dir = "data_processed"
        os.makedirs(dump_dir, exist_ok=True)

        # Output files are numbered from 1, in page order
        for index in range(len(rendered_pages)):
            page_number = index + 1
            target = os.path.join(dump_dir, f"page_{page_number}.png")
            rendered_pages[index].save(target, "PNG")
            print(f"Saved {target}")