Mageia committed on
Commit
18def71
·
unverified ·
1 Parent(s): e9361c0

add: pdf2images

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +55 -12
  3. got_ocr.py +53 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ results/*
2
+ uploads/*
app.py CHANGED
@@ -1,52 +1,95 @@
1
  import os
 
2
  import uuid
3
 
4
  import fitz # PyMuPDF
5
  import gradio as gr
6
- from PIL import Image
 
 
 
 
 
 
 
 
7
 
8
  UPLOAD_FOLDER = "./uploads"
9
  RESULTS_FOLDER = "./results"
10
 
 
 
 
 
11
 
12
  def pdf_to_images(pdf_path):
13
  images = []
14
  pdf_document = fitz.open(pdf_path)
15
  for page_num in range(len(pdf_document)):
16
  page = pdf_document.load_page(page_num)
17
- pix = page.get_pixmap()
 
 
 
18
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
 
 
 
 
 
19
  images.append(img)
20
  pdf_document.close()
21
  return images
22
 
23
 
24
  def process_pdf(pdf_file):
 
 
 
25
  temp_pdf_path = os.path.join(UPLOAD_FOLDER, f"{uuid.uuid4()}.pdf")
26
- pdf_file.save(temp_pdf_path)
 
 
 
27
  images = pdf_to_images(temp_pdf_path)
28
  os.remove(temp_pdf_path)
29
- return images
 
 
 
 
 
 
 
 
30
 
31
 
32
- def display_images(images):
33
- image_elements = [gr.Image(value=img, type="pil") for img in images]
34
- return gr.Gallery(value=image_elements)
 
35
 
36
 
37
- def on_image_select(image):
38
- return image
 
 
 
 
39
 
40
 
41
  with gr.Blocks() as demo:
42
  pdf_input = gr.File(label="上传PDF文件")
43
  image_gallery = gr.Gallery(label="PDF页面预览", columns=3, height="auto")
44
- selected_image = gr.Image(label="选中的图片", type="pil")
 
 
45
 
46
  pdf_input.upload(fn=process_pdf, inputs=pdf_input, outputs=image_gallery)
47
- image_gallery.select(fn=on_image_select, inputs=image_gallery, outputs=selected_image)
 
48
 
49
- # 这里可以添加OCR转换功能的相关组件和逻辑
50
 
51
  if __name__ == "__main__":
52
  demo.launch()
 
1
  import os
2
+ import shutil
3
  import uuid
4
 
5
  import fitz # PyMuPDF
6
  import gradio as gr
7
+ from modelscope import AutoModel, AutoTokenizer
8
+ from PIL import Image, ImageEnhance
9
+
10
+ from got_ocr import got_ocr
11
+
12
+ # 初始化模型和分词器
13
+ tokenizer = AutoTokenizer.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True)
14
+ model = AutoModel.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True, low_cpu_mem_usage=True, device_map="cuda", use_safetensors=True)
15
+ model = model.eval().cuda()
16
 
17
  UPLOAD_FOLDER = "./uploads"
18
  RESULTS_FOLDER = "./results"
19
 
20
+ # 确保必要的文件夹存在
21
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
22
+ os.makedirs(RESULTS_FOLDER, exist_ok=True)
23
+
24
 
25
def pdf_to_images(pdf_path, zoom=4, contrast=1.5):
    """Render every page of a PDF to a contrast-enhanced PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        zoom: Scale factor applied to both axes when rasterising a page.
            The default of 4 is the value that used to be hard-coded.
        contrast: Factor passed to ``ImageEnhance.Contrast.enhance``.
            The default of 1.5 is the value that used to be hard-coded.

    Returns:
        A list of ``PIL.Image`` objects in page order.
    """
    # The matrix does not depend on the page, so build it once.
    mat = fitz.Matrix(zoom, zoom)
    images = []
    # The context manager closes the document even if a page fails to render.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            # alpha=False: no alpha channel, so the samples are read as "RGB".
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(ImageEnhance.Contrast(img).enhance(contrast))
    return images
43
 
44
 
45
def process_pdf(pdf_file):
    """Convert an uploaded PDF into per-page PNG files for the gallery.

    Args:
        pdf_file: The value delivered by the ``gr.File`` upload event, or
            ``None`` when no file is present. Either a path string or an
            object exposing the path as ``.name`` is accepted.

    Returns:
        A list of PNG file paths (one per page, in page order), or ``None``
        when ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        return None

    # Accept both a plain path and an object that carries the path in ``.name``.
    source_path = getattr(pdf_file, "name", pdf_file)

    upload_id = uuid.uuid4().hex
    temp_pdf_path = os.path.join(UPLOAD_FOLDER, f"{upload_id}.pdf")

    # Copy the uploaded file to a temporary location before rendering it.
    shutil.copy(source_path, temp_pdf_path)
    try:
        images = pdf_to_images(temp_pdf_path)
    finally:
        # Remove the temporary copy even when conversion fails.
        os.remove(temp_pdf_path)

    # Each upload gets its own directory so a later upload cannot overwrite
    # page images that an earlier gallery is still displaying.
    output_dir = os.path.join(RESULTS_FOLDER, upload_id)
    os.makedirs(output_dir, exist_ok=True)

    image_paths = []
    for page_number, img in enumerate(images, start=1):
        img_path = os.path.join(output_dir, f"page_{page_number}.png")
        img.save(img_path, "PNG")
        image_paths.append(img_path)

    return image_paths
65
 
66
 
67
def on_image_select(evt: gr.SelectData):
    """Return the index carried by a gallery selection event.

    The index is passed through unchanged; when the event has no index the
    result is ``None``.
    """
    return evt.index
71
 
72
 
73
def perform_ocr(selected_index, image_paths):
    """Run OCR on the gallery page the user selected.

    Args:
        selected_index: Index of the selected gallery item, or ``None`` when
            nothing has been selected.
        image_paths: Current gallery value. Items may be ``(path, caption)``
            pairs or plain paths.

    Returns:
        The recognised text, or a prompt asking the user to select an image
        when there is no valid selection.
    """
    if selected_index is None or not image_paths:
        return "请先选择一个图片"
    if not 0 <= selected_index < len(image_paths):
        return "请先选择一个图片"

    item = image_paths[selected_index]
    # Only unwrap pairs; indexing a plain path string would yield one character.
    selected_image = item[0] if isinstance(item, (list, tuple)) else item

    result = got_ocr(model, selected_image)
    # got_ocr returns (text, encoded_html); the textbox should show only the text.
    return result[0] if isinstance(result, tuple) else result
79
 
80
 
81
# Build the UI: upload a PDF, preview its pages, pick one, run OCR on it.
with gr.Blocks() as demo:
    pdf_input = gr.File(label="上传PDF文件")
    image_gallery = gr.Gallery(label="PDF页面预览", columns=3, height="auto")
    # Holds the index of the gallery item the user last clicked.
    selected_index = gr.State(None)
    ocr_button = gr.Button("开始OCR识别")
    ocr_result = gr.Textbox(label="OCR结果")

    upload_event = pdf_input.upload(fn=process_pdf, inputs=pdf_input, outputs=image_gallery)
    # Clear the previous selection so OCR cannot run on a stale index that
    # belonged to the previously uploaded document.
    upload_event.then(fn=lambda: None, inputs=None, outputs=selected_index)
    image_gallery.select(fn=on_image_select, inputs=[], outputs=selected_index)
    ocr_button.click(fn=perform_ocr, inputs=[selected_index, image_gallery], outputs=ocr_result)

    # The selected-image preview was removed; only the index is tracked.
93
 
94
  if __name__ == "__main__":
95
  demo.launch()
got_ocr.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+
4
+
5
_GOT_MODEL_ID = "stepfun-ai/GOT-OCR2_0"

# Every mode string accepted by got_ocr.
_GOT_MODES = frozenset(
    {
        "plain texts OCR",
        "format texts OCR",
        "plain multi-crop OCR",
        "format multi-crop OCR",
        "plain fine-grained OCR",
        "format fine-grained OCR",
    }
)


def _default_tokenizer():
    """Return the tokenizer to use when the caller did not pass one."""
    # A module-level ``tokenizer`` (set when this file runs as a script) wins.
    shared = globals().get("tokenizer")
    if shared is not None:
        return shared
    if _default_tokenizer.cached is None:
        from modelscope import AutoTokenizer

        _default_tokenizer.cached = AutoTokenizer.from_pretrained(_GOT_MODEL_ID, trust_remote_code=True)
    return _default_tokenizer.cached


_default_tokenizer.cached = None


def got_ocr(model, image_path, got_mode="plain texts OCR", fine_grained_mode="", ocr_color="", ocr_box="", tokenizer=None):
    """Run GOT-OCR on one image and return the text plus optional HTML.

    Args:
        model: Object exposing ``chat`` and ``chat_crop``.
        image_path: Path of the image to recognise.
        got_mode: One of the six supported mode strings, e.g.
            ``"plain texts OCR"`` or ``"format multi-crop OCR"``.
        fine_grained_mode: Accepted for interface compatibility; not used.
        ocr_color: Passed to the model in the fine-grained modes.
        ocr_box: Passed to the model in the fine-grained modes.
        tokenizer: Tokenizer handed to the model. When omitted, a
            module-level ``tokenizer`` is used if present; otherwise one is
            loaded once and reused. Previously the function read an undefined
            global whenever it was imported from another module.

    Returns:
        ``(text, encoded_html)``. ``encoded_html`` is the base64-encoded
        rendered HTML for "format" modes when the render file exists,
        otherwise ``None``. Any failure is reported as
        ``("错误: <message>", None)`` instead of raising.
    """
    try:
        # Validate first so an unknown mode gives a clear message instead of
        # an unbound-variable error further down.
        if got_mode not in _GOT_MODES:
            raise ValueError(f"unsupported got_mode: {got_mode!r}")
        if tokenizer is None:
            tokenizer = _default_tokenizer()

        formatted = got_mode.startswith("format")
        chat_kwargs = {"ocr_type": "format" if formatted else "ocr"}
        if "fine-grained" in got_mode:
            chat_kwargs.update(ocr_box=ocr_box, ocr_color=ocr_color)

        result_path = None
        if formatted:
            # Formatted modes ask the model to render an HTML file next to the image.
            result_path = f"{os.path.splitext(image_path)[0]}_result.html"
            chat_kwargs.update(render=True, save_render_file=result_path)

        run = model.chat_crop if "multi-crop" in got_mode else model.chat
        res = run(tokenizer, image_path, **chat_kwargs)

        # Attach the rendered HTML (base64) when a formatted render was produced.
        if result_path is not None and os.path.exists(result_path):
            with open(result_path, "r") as f:
                html_content = f.read()
            encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
            return res, encoded_html
        return res, None

    except Exception as e:
        # Deliberate best-effort boundary: callers display the message as text.
        return f"错误: {str(e)}", None
38
+
39
+
40
+ # 使用示例
41
+ if __name__ == "__main__":
42
+ from modelscope import AutoModel, AutoTokenizer
43
+
44
+ # 初始化模型和分词器
45
+ tokenizer = AutoTokenizer.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True)
46
+ model = AutoModel.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True, low_cpu_mem_usage=True, device_map="cuda", use_safetensors=True)
47
+ model = model.eval().cuda()
48
+
49
+ image_path = "path/to/your/image.png"
50
+ result, html = got_ocr(model, image_path, got_mode="format texts OCR")
51
+ print("OCR结果:", result)
52
+ if html:
53
+ print("HTML结果可用")