Spaces:
Sleeping
Sleeping
Upload utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pymupdf
|
2 |
+
from PIL import Image
|
3 |
+
import io
|
4 |
+
import gradio as gr
|
5 |
+
import base64
|
6 |
+
import pandas as pd
|
7 |
+
import pymupdf
|
8 |
+
|
9 |
+
|
10 |
+
def image_to_bytes(image):
    """Serialise *image* as PNG and return it as base64 text.

    Despite the name, the result is a ``str`` rather than ``bytes``.

    Args:
        image: Object exposing ``save(fp, format=...)``; presumably a
            PIL image -- confirm against callers.

    Returns:
        The PNG byte stream, base64-encoded and decoded as UTF-8.
    """
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    png_payload = buffer.getvalue()
    encoded = base64.b64encode(png_payload)
    return encoded.decode("utf-8")
|
14 |
+
|
15 |
+
|
16 |
+
def extract_pdfs(docs, doc_collection):
    """Replace the document collection with the uploaded files and list them.

    Args:
        docs: Iterable of file path strings, or ``None``/empty when nothing
            was uploaded.
        doc_collection: The current collection. It is returned unchanged
            when ``docs`` is empty and is never mutated in place.

    Returns:
        A 3-tuple of:
        * the collection (a new list of ``docs`` when any were given),
        * ``gr.Tabs(selected=1)``,
        * a DataFrame with a single "Filename" column holding the last
          "/"-separated segment of each path.
    """
    # Treat a missing upload as "no files" instead of failing with a
    # TypeError when iterating over None.
    paths = list(docs) if docs else []

    if paths:
        # New uploads replace the previous collection; binding a fresh list
        # keeps the caller's original list object untouched.
        doc_collection = paths

    filenames = [path.split("/")[-1] for path in paths]
    return (
        doc_collection,
        gr.Tabs(selected=1),
        pd.DataFrame(filenames, columns=["Filename"]),
    )
|
25 |
+
|
26 |
+
|
27 |
+
def extract_images(docs):
    """Collect the images embedded in the given documents as PIL images.

    Args:
        docs: Iterable of paths accepted by ``pymupdf.open``; ``None`` or an
            empty iterable yields an empty result.

    Returns:
        A list of ``PIL.Image`` objects, one per image entry reported by
        ``page.get_images()``, in document/page order. Each image is
        re-encoded as JPEG in memory before being opened with PIL.
    """
    images = []
    for doc_path in docs or ():
        # The context manager closes the document even if decoding an image
        # raises; previously every opened document was left open.
        with pymupdf.open(doc_path) as doc:
            for page in doc:
                for img in page.get_images():
                    # The first item of each entry is the image's xref.
                    xref = img[0]
                    pix = pymupdf.Pixmap(doc, xref)

                    # More than three colour components (alpha excluded),
                    # e.g. CMYK: convert to RGB before encoding.
                    if pix.n - pix.alpha > 3:
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                    # The encoded bytes live in their own buffer, so the
                    # PIL image stays usable after the document is closed.
                    images.append(Image.open(io.BytesIO(pix.pil_tobytes("JPEG"))))
    return images
|
45 |
+
|
46 |
+
|
47 |
+
def clean_text(text):
    """Normalise whitespace in *text*.

    Newlines, tabs and any other whitespace are turned into single spaces,
    runs of whitespace are collapsed to one space, and leading/trailing
    whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if *text* holds only whitespace.
    """
    # split() with no argument breaks on any run of whitespace and drops
    # empty leading/trailing pieces, so joining with one space strips and
    # collapses in a single pass. The earlier chained replace() calls left
    # doubled spaces behind (e.g. from "\n\n") because the final
    # replace(" ", " ") was a no-op.
    return " ".join(text.split())
|