Spaces:

cdcvd
/

Ocr_check

Running

App Files Community

cdcvd committed on Aug 10, 2024

Commit

c89663f

verified ·

1 Parent(s): b874e98

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -15

app.py CHANGED Viewed

@@ -17,26 +17,57 @@ def trim_whitespace(image):
     return trimmed_image
 def convert_pdf_to_images(pdf_path, zoom=2):
-    pdf_document = fitz.open(pdf_path)
-    images = []
-    for page_num in range(len(pdf_document)):
-        page = pdf_document.load_page(page_num)
-        matrix = fitz.Matrix(zoom, zoom)
-        pix = page.get_pixmap(matrix=matrix)
-        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        trimmed_image = trim_whitespace(image)
-        images.append(trimmed_image)
     return images
-def convert_docx_to_images(docx_path):
-    document = Document(docx_path)
     images = []
-    for image_shape in document.inline_shapes:
-        image_stream = image_shape.image.blob
-        image = Image.open(io.BytesIO(image_stream))
-        images.append(image)
     return images
 def remove_background_from_image(image):
     return remove(image)

     return trimmed_image
 def convert_pdf_to_images(pdf_path, zoom=2):
+    try:
+       pdf_document = fitz.open(pdf_path)
+       name_with_extension = os.path.basename(pdf_path)
+       name = os.path.splitext(name_with_extension)[0]
+       images = []
+       for page_num in range(len(pdf_document)):
+           page = pdf_document.load_page(page_num)
+           matrix = fitz.Matrix(zoom, zoom)
+           pix = page.get_pixmap(matrix=matrix)
+           image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+           trimmed_image = trim_whitespace(image)
+           images.append(trimmed_image)
     return images
+import os
+from io import BytesIO
+from docx import Document
+from PIL import Image
+def convert_docx_to_jpeg(docx_bytes):
+    """
+    Convert each image in a DOCX file to a separate JPEG image and return them as a list.
+    Args:
+    - docx_bytes: The binary content of the DOCX file.
+    Returns:
+    - A list of PIL Image objects in JPEG format.
+    """
+    document = Document(BytesIO(docx_bytes))
     images = []
+    for rel in document.part.rels.values():
+        if "image" in rel.target_ref:
+            image_stream = rel.target_part.blob
+            image = Image.open(BytesIO(image_stream))
+            jpeg_image = BytesIO()
+            image.convert('RGB').save(jpeg_image, format="JPEG")
+            jpeg_image.seek(0)
+            images.append(Image.open(jpeg_image))
     return images
+# Example usage:
+# with open("example.docx", "rb") as f:
+#     docx_bytes = f.read()
+# images = convert_docx_to_jpeg(docx_bytes)
+# for img in images:
+#     img.show()
 def remove_background_from_image(image):
     return remove(image)