Spaces:

Raghuan
/

PDF_chatbot

Sleeping

Raghuan committed on Aug 11, 2024

Commit

31d9dba

verified ·

1 Parent(s): e370c3a

Upload utils.py

Files changed (1) hide show

utils.py ADDED Viewed

+import pymupdf
+from PIL import Image
+import io
+import gradio as gr
+import base64
+import pandas as pd
+import pymupdf
+def image_to_bytes(image):
+    img_byte_arr = io.BytesIO()
+    image.save(img_byte_arr, format="PNG")
+    return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
+def extract_pdfs(docs, doc_collection):
+    if docs:
+        doc_collection = []
+        doc_collection.extend(docs)
+    return (
+        doc_collection,
+        gr.Tabs(selected=1),
+        pd.DataFrame([i.split("/")[-1] for i in list(docs)], columns=["Filename"]),
+    )
+def extract_images(docs):
+    images = []
+    for doc_path in docs:
+        doc = pymupdf.open(doc_path)
+        for page_index in range(len(doc)):
+            page = doc[page_index]
+            image_list = page.get_images()
+            for _, img in enumerate(image_list, start=1):
+                xref = img[0]
+                pix = pymupdf.Pixmap(doc, xref)
+                if pix.n - pix.alpha > 3:
+                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
+                images.append(Image.open(io.BytesIO(pix.pil_tobytes("JPEG"))))
+    return images
+def clean_text(text):
+    text = text.strip()
+    cleaned_text = text.replace("\n", " ")
+    cleaned_text = cleaned_text.replace("\t", " ")
+    cleaned_text = cleaned_text.replace("  ", " ")
+    cleaned_text = cleaned_text.strip()
+    return cleaned_text