add: pdf2images

- .gitignore +2 -0
- app.py +55 -12
- got_ocr.py +53 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+results/*
+uploads/*
app.py CHANGED
@@ -1,52 +1,95 @@
 import os
+import shutil
 import uuid
 
 import fitz  # PyMuPDF
 import gradio as gr
+from modelscope import AutoModel, AutoTokenizer
+from PIL import Image, ImageEnhance
+
+from got_ocr import got_ocr
+
+# Initialize the model and tokenizer
+tokenizer = AutoTokenizer.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True)
+model = AutoModel.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True, low_cpu_mem_usage=True, device_map="cuda", use_safetensors=True)
+model = model.eval().cuda()
 
 UPLOAD_FOLDER = "./uploads"
 RESULTS_FOLDER = "./results"
 
+# Make sure the required folders exist
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+os.makedirs(RESULTS_FOLDER, exist_ok=True)
+
 
 def pdf_to_images(pdf_path):
     images = []
     pdf_document = fitz.open(pdf_path)
     for page_num in range(len(pdf_document)):
         page = pdf_document.load_page(page_num)
+        # Render at a higher resolution
+        zoom = 4  # raise the zoom factor to 4
+        mat = fitz.Matrix(zoom, zoom)
+        pix = page.get_pixmap(matrix=mat, alpha=False)
         img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+
+        # Increase the contrast
+        enhancer = ImageEnhance.Contrast(img)
+        img = enhancer.enhance(1.5)  # increase contrast by 50%
+
         images.append(img)
     pdf_document.close()
     return images
 
 
 def process_pdf(pdf_file):
+    if pdf_file is None:
+        return None
+
     temp_pdf_path = os.path.join(UPLOAD_FOLDER, f"{uuid.uuid4()}.pdf")
+
+    # Copy the uploaded file to a temporary location with shutil
+    shutil.copy(pdf_file.name, temp_pdf_path)
+
     images = pdf_to_images(temp_pdf_path)
     os.remove(temp_pdf_path)
+
+    # Save the images as temporary files and return the list of file paths
+    image_paths = []
+    for i, img in enumerate(images):
+        img_path = os.path.join(RESULTS_FOLDER, f"page_{i+1}.png")
+        img.save(img_path, "PNG")
+        image_paths.append(img_path)
+
+    return image_paths
 
 
+def on_image_select(evt: gr.SelectData):
+    if evt.index is not None:
+        return evt.index
+    return None
 
 
+def perform_ocr(selected_index, image_paths):
+    if selected_index is not None and image_paths and 0 <= selected_index < len(image_paths):
+        selected_image = image_paths[selected_index][0]
+        # Run OCR on the selected page; got_ocr returns (text, optional rendered HTML)
+        return got_ocr(model, tokenizer, selected_image)[0]
+    return "Please select an image first"
 
 
 with gr.Blocks() as demo:
     pdf_input = gr.File(label="Upload a PDF file")
     image_gallery = gr.Gallery(label="PDF page preview", columns=3, height="auto")
+    selected_index = gr.State(None)
+    ocr_button = gr.Button("Run OCR")
+    ocr_result = gr.Textbox(label="OCR result")
 
     pdf_input.upload(fn=process_pdf, inputs=pdf_input, outputs=image_gallery)
+    image_gallery.select(fn=on_image_select, inputs=[], outputs=selected_index)
+    ocr_button.click(fn=perform_ocr, inputs=[selected_index, image_gallery], outputs=ocr_result)
 
+    # The separate display of the selected image was removed
 
 if __name__ == "__main__":
     demo.launch()
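The conversion pipeline can be exercised outside the Gradio UI. A minimal sketch, assuming a CUDA-capable environment and a local sample.pdf (the file name is illustrative); importing app loads the GOT-OCR2_0 model as a side effect:

# Convert a PDF to page images and run plain-text OCR on the first page.
from app import pdf_to_images, model, tokenizer
from got_ocr import got_ocr

pages = pdf_to_images("sample.pdf")          # PIL images at 4x zoom, contrast-enhanced
pages[0].save("results/page_1.png", "PNG")   # got_ocr expects an image path
text, _ = got_ocr(model, tokenizer, "results/page_1.png", got_mode="plain texts OCR")
print(text)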
got_ocr.py ADDED
@@ -0,0 +1,53 @@
+import base64
+import os
+
+
+def got_ocr(model, tokenizer, image_path, got_mode="plain texts OCR", fine_grained_mode="", ocr_color="", ocr_box=""):
+    # Run OCR in the requested mode
+    try:
+        if got_mode == "plain texts OCR":
+            res = model.chat(tokenizer, image_path, ocr_type="ocr")
+            return res, None
+        elif got_mode == "format texts OCR":
+            result_path = f"{os.path.splitext(image_path)[0]}_result.html"
+            res = model.chat(tokenizer, image_path, ocr_type="format", render=True, save_render_file=result_path)
+        elif got_mode == "plain multi-crop OCR":
+            res = model.chat_crop(tokenizer, image_path, ocr_type="ocr")
+            return res, None
+        elif got_mode == "format multi-crop OCR":
+            result_path = f"{os.path.splitext(image_path)[0]}_result.html"
+            res = model.chat_crop(tokenizer, image_path, ocr_type="format", render=True, save_render_file=result_path)
+        elif got_mode == "plain fine-grained OCR":
+            res = model.chat(tokenizer, image_path, ocr_type="ocr", ocr_box=ocr_box, ocr_color=ocr_color)
+            return res, None
+        elif got_mode == "format fine-grained OCR":
+            result_path = f"{os.path.splitext(image_path)[0]}_result.html"
+            res = model.chat(tokenizer, image_path, ocr_type="format", ocr_box=ocr_box, ocr_color=ocr_color, render=True, save_render_file=result_path)
+
+        # For the "format" modes, also return the rendered HTML, base64-encoded
+        if "format" in got_mode and os.path.exists(result_path):
+            with open(result_path, "r") as f:
+                html_content = f.read()
+            encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
+            return res, encoded_html
+        else:
+            return res, None
+
+    except Exception as e:
+        return f"Error: {str(e)}", None
+
+
+# Usage example
+if __name__ == "__main__":
+    from modelscope import AutoModel, AutoTokenizer
+
+    # Initialize the model and tokenizer
+    tokenizer = AutoTokenizer.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True)
+    model = AutoModel.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True, low_cpu_mem_usage=True, device_map="cuda", use_safetensors=True)
+    model = model.eval().cuda()
+
+    image_path = "path/to/your/image.png"
+    result, html = got_ocr(model, tokenizer, image_path, got_mode="format texts OCR")
+    print("OCR result:", result)
+    if html:
+        print("HTML result available")
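In the "format" modes the second tuple element is the rendered result as base64-encoded HTML. A small sketch of decoding it back into a viewable file, assuming the page image from the previous sketch exists (the output path is illustrative):

import base64

from app import model, tokenizer
from got_ocr import got_ocr

text, encoded_html = got_ocr(model, tokenizer, "results/page_1.png", got_mode="format texts OCR")
if encoded_html:
    # got_ocr base64-encodes the rendered HTML; decode it back to a file for inspection
    with open("results/page_1_render.html", "w", encoding="utf-8") as f:
        f.write(base64.b64decode(encoded_html).decode("utf-8"))
print(text)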