myhloli committed on
Commit
a011e6d
·
verified ·
1 Parent(s): ac89f00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -8
app.py CHANGED
@@ -33,14 +33,13 @@ def read_fn(path):
33
  return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
34
 
35
 
36
- # @spaces.GPU
37
- def parse_pdf(doc_path, output_dir, end_page_id, ocr):
38
  os.makedirs(output_dir, exist_ok=True)
39
 
40
  try:
41
  file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
42
  pdf_data = read_fn(doc_path)
43
- if ocr:
44
  parse_method = "ocr"
45
  else:
46
  parse_method = "auto"
@@ -53,6 +52,10 @@ def parse_pdf(doc_path, output_dir, end_page_id, ocr):
53
  parse_method,
54
  False,
55
  end_page_id=end_page_id,
 
 
 
 
56
  )
57
  return local_md_dir, file_name
58
  except Exception as e:
@@ -104,9 +107,10 @@ def replace_image_with_base64(markdown_text, image_dir_path):
104
  return re.sub(pattern, replace, markdown_text)
105
 
106
 
107
- def to_markdown(file_path, end_pages, ocr):
108
  # 获取识别的md文件以及压缩包文件路径
109
- local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, ocr)
 
110
  archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
111
  zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
112
  if zip_archive_success == 0:
@@ -149,6 +153,27 @@ with open("header.html", "r") as file:
149
  header = file.read()
150
 
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  if __name__ == "__main__":
153
  with gr.Blocks() as demo:
154
  gr.HTML(header)
@@ -156,8 +181,14 @@ if __name__ == "__main__":
156
  with gr.Column(variant='panel', scale=5):
157
  pdf_show = gr.Markdown()
158
  max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
159
- with gr.Row() as bu_flow:
160
- is_ocr = gr.Checkbox(label="Force enable OCR")
 
 
 
 
 
 
161
  change_bu = gr.Button("Convert")
162
  clear_bu = gr.ClearButton([pdf_show], value="Clear")
163
  pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
@@ -177,7 +208,8 @@ if __name__ == "__main__":
177
  latex_delimiters=latex_delimiters, line_breaks=True)
178
  with gr.Tab("Markdown text"):
179
  md_text = gr.TextArea(lines=45, show_copy_button=True)
180
- change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show])
 
181
  clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
182
 
183
  demo.launch()
 
33
  return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
34
 
35
 
36
+ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
 
37
  os.makedirs(output_dir, exist_ok=True)
38
 
39
  try:
40
  file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
41
  pdf_data = read_fn(doc_path)
42
+ if is_ocr:
43
  parse_method = "ocr"
44
  else:
45
  parse_method = "auto"
 
52
  parse_method,
53
  False,
54
  end_page_id=end_page_id,
55
+ layout_model=layout_mode,
56
+ formula_enable=formula_enable,
57
+ table_enable=table_enable,
58
+ lang=language,
59
  )
60
  return local_md_dir, file_name
61
  except Exception as e:
 
107
  return re.sub(pattern, replace, markdown_text)
108
 
109
 
110
+ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
111
  # 获取识别的md文件以及压缩包文件路径
112
+ local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
113
+ layout_mode, formula_enable, table_enable, language)
114
  archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
115
  zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
116
  if zip_archive_success == 0:
 
153
  header = file.read()
154
 
155
 
156
+ latin_lang = [
157
+ 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
158
+ 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
159
+ 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
160
+ 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
161
+ ]
162
+ arabic_lang = ['ar', 'fa', 'ug', 'ur']
163
+ cyrillic_lang = [
164
+ 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
165
+ 'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
166
+ ]
167
+ devanagari_lang = [
168
+ 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
169
+ 'sa', 'bgc'
170
+ ]
171
+ other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
172
+
173
+ all_lang = [""]
174
+ all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
175
+
176
+
177
  if __name__ == "__main__":
178
  with gr.Blocks() as demo:
179
  gr.HTML(header)
 
181
  with gr.Column(variant='panel', scale=5):
182
  pdf_show = gr.Markdown()
183
  max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
184
+ with gr.Row():
185
+ layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
186
+ language = gr.Dropdown(all_lang, label="Language", value="")
187
+ with gr.Row():
188
+ formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
189
+ is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
190
+ table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
191
+ with gr.Row():
192
  change_bu = gr.Button("Convert")
193
  clear_bu = gr.ClearButton([pdf_show], value="Clear")
194
  pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
 
208
  latex_delimiters=latex_delimiters, line_breaks=True)
209
  with gr.Tab("Markdown text"):
210
  md_text = gr.TextArea(lines=45, show_copy_button=True)
211
+ change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
212
+ outputs=[md, md_text, output_file, pdf_show])
213
  clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
214
 
215
  demo.launch()