Raghuan committed on
Commit
31d9dba
·
verified ·
1 Parent(s): e370c3a

Upload utils.py

Browse files
Files changed (1) hide show
  1. utils.py +53 -0
utils.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pymupdf
2
+ from PIL import Image
3
+ import io
4
+ import gradio as gr
5
+ import base64
6
+ import pandas as pd
7
+ import pymupdf
8
+
9
+
10
def image_to_bytes(image):
    """Render an image as PNG and return the result as base64 text.

    Args:
        image: Object exposing ``save(fp, format=...)`` (presumably a PIL
            image -- confirm against callers).

    Returns:
        The PNG payload as a base64-encoded UTF-8 string. Note that, despite
        the function name, this is ``str`` rather than raw ``bytes``.
    """
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    png_payload = png_buffer.getvalue()
    encoded = base64.b64encode(png_payload)
    return encoded.decode("utf-8")
14
+
15
+
16
def extract_pdfs(docs, doc_collection):
    """Register uploaded documents and build the filename table for the UI.

    Args:
        docs: Iterable of uploaded file paths (strings), or ``None``/empty
            when nothing was uploaded.
        doc_collection: The current document collection; returned unchanged
            when ``docs`` is empty, otherwise replaced by a fresh list of
            ``docs``.

    Returns:
        A 3-tuple of ``(doc_collection, gr.Tabs(selected=1), DataFrame)``
        where the DataFrame has a single ``"Filename"`` column holding the
        last ``/``-separated component of each path.
    """
    # Normalise once: the upload value may be None, and the original code
    # called list(docs) unconditionally, which raised TypeError in that case.
    uploaded = list(docs) if docs else []

    if uploaded:
        # A new upload replaces the previous collection rather than
        # appending to it.
        doc_collection = list(uploaded)

    filenames = [path.split("/")[-1] for path in uploaded]
    return (
        doc_collection,
        gr.Tabs(selected=1),
        pd.DataFrame(filenames, columns=["Filename"]),
    )
25
+
26
+
27
def extract_images(docs):
    """Collect every embedded image from the given documents.

    Args:
        docs: Iterable of document paths that ``pymupdf.open`` can read.

    Returns:
        A list of PIL images, in document/page/image order. Each image is
        re-encoded through JPEG bytes held in memory.
    """
    images = []
    for doc_path in docs:
        # Use the document as a context manager so the file handle is
        # released even if extraction fails part-way through; the original
        # never closed it.
        with pymupdf.open(doc_path) as doc:
            for page in doc:
                for img in page.get_images():
                    xref = img[0]  # first entry of each image record is its xref
                    pix = pymupdf.Pixmap(doc, xref)

                    # More than three colour components (alpha excluded),
                    # e.g. CMYK: convert to RGB before encoding.
                    if pix.n - pix.alpha > 3:
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                    jpeg_bytes = pix.pil_tobytes("JPEG")
                    images.append(Image.open(io.BytesIO(jpeg_bytes)))
    return images
45
+
46
+
47
def clean_text(text):
    """Flatten text onto one line and collapse runs of spaces.

    Newlines and tabs become spaces, consecutive spaces are reduced to a
    single space, and leading/trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    flattened = text.strip().replace("\n", " ").replace("\t", " ")
    # The original replaced a single space with a single space (a no-op), so
    # gaps produced by adjacent newlines/tabs were never collapsed. Splitting
    # on a literal space and dropping empty pieces collapses them without
    # touching any other characters.
    collapsed = " ".join(piece for piece in flattened.split(" ") if piece)
    return collapsed.strip()