Spaces:
Running
on
L4
Running
on
L4
Update app.py
Browse files
app.py
CHANGED
@@ -33,14 +33,13 @@ def read_fn(path):
|
|
33 |
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
34 |
|
35 |
|
36 |
-
|
37 |
-
def parse_pdf(doc_path, output_dir, end_page_id, ocr):
|
38 |
os.makedirs(output_dir, exist_ok=True)
|
39 |
|
40 |
try:
|
41 |
file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
|
42 |
pdf_data = read_fn(doc_path)
|
43 |
-
if
|
44 |
parse_method = "ocr"
|
45 |
else:
|
46 |
parse_method = "auto"
|
@@ -53,6 +52,10 @@ def parse_pdf(doc_path, output_dir, end_page_id, ocr):
|
|
53 |
parse_method,
|
54 |
False,
|
55 |
end_page_id=end_page_id,
|
|
|
|
|
|
|
|
|
56 |
)
|
57 |
return local_md_dir, file_name
|
58 |
except Exception as e:
|
@@ -104,9 +107,10 @@ def replace_image_with_base64(markdown_text, image_dir_path):
|
|
104 |
return re.sub(pattern, replace, markdown_text)
|
105 |
|
106 |
|
107 |
-
def to_markdown(file_path, end_pages,
|
108 |
# 获取识别的md文件以及压缩包文件路径
|
109 |
-
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1,
|
|
|
110 |
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
|
111 |
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
|
112 |
if zip_archive_success == 0:
|
@@ -149,6 +153,27 @@ with open("header.html", "r") as file:
|
|
149 |
header = file.read()
|
150 |
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
if __name__ == "__main__":
|
153 |
with gr.Blocks() as demo:
|
154 |
gr.HTML(header)
|
@@ -156,8 +181,14 @@ if __name__ == "__main__":
|
|
156 |
with gr.Column(variant='panel', scale=5):
|
157 |
pdf_show = gr.Markdown()
|
158 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
159 |
-
with gr.Row()
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
change_bu = gr.Button("Convert")
|
162 |
clear_bu = gr.ClearButton([pdf_show], value="Clear")
|
163 |
pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
|
@@ -177,7 +208,8 @@ if __name__ == "__main__":
|
|
177 |
latex_delimiters=latex_delimiters, line_breaks=True)
|
178 |
with gr.Tab("Markdown text"):
|
179 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
180 |
-
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr
|
|
|
181 |
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
|
182 |
|
183 |
demo.launch()
|
|
|
33 |
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
34 |
|
35 |
|
36 |
+
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
|
|
|
37 |
os.makedirs(output_dir, exist_ok=True)
|
38 |
|
39 |
try:
|
40 |
file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
|
41 |
pdf_data = read_fn(doc_path)
|
42 |
+
if is_ocr:
|
43 |
parse_method = "ocr"
|
44 |
else:
|
45 |
parse_method = "auto"
|
|
|
52 |
parse_method,
|
53 |
False,
|
54 |
end_page_id=end_page_id,
|
55 |
+
layout_model=layout_mode,
|
56 |
+
formula_enable=formula_enable,
|
57 |
+
table_enable=table_enable,
|
58 |
+
lang=language,
|
59 |
)
|
60 |
return local_md_dir, file_name
|
61 |
except Exception as e:
|
|
|
107 |
return re.sub(pattern, replace, markdown_text)
|
108 |
|
109 |
|
110 |
+
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
|
111 |
# 获取识别的md文件以及压缩包文件路径
|
112 |
+
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
|
113 |
+
layout_mode, formula_enable, table_enable, language)
|
114 |
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
|
115 |
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
|
116 |
if zip_archive_success == 0:
|
|
|
153 |
header = file.read()
|
154 |
|
155 |
|
156 |
+
latin_lang = [
|
157 |
+
'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
|
158 |
+
'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
|
159 |
+
'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
|
160 |
+
'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
|
161 |
+
]
|
162 |
+
arabic_lang = ['ar', 'fa', 'ug', 'ur']
|
163 |
+
cyrillic_lang = [
|
164 |
+
'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
|
165 |
+
'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
|
166 |
+
]
|
167 |
+
devanagari_lang = [
|
168 |
+
'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
|
169 |
+
'sa', 'bgc'
|
170 |
+
]
|
171 |
+
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
|
172 |
+
|
173 |
+
all_lang = [""]
|
174 |
+
all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
|
175 |
+
|
176 |
+
|
177 |
if __name__ == "__main__":
|
178 |
with gr.Blocks() as demo:
|
179 |
gr.HTML(header)
|
|
|
181 |
with gr.Column(variant='panel', scale=5):
|
182 |
pdf_show = gr.Markdown()
|
183 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
184 |
+
with gr.Row():
|
185 |
+
layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
|
186 |
+
language = gr.Dropdown(all_lang, label="Language", value="")
|
187 |
+
with gr.Row():
|
188 |
+
formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
|
189 |
+
is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
|
190 |
+
table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
|
191 |
+
with gr.Row():
|
192 |
change_bu = gr.Button("Convert")
|
193 |
clear_bu = gr.ClearButton([pdf_show], value="Clear")
|
194 |
pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
|
|
|
208 |
latex_delimiters=latex_delimiters, line_breaks=True)
|
209 |
with gr.Tab("Markdown text"):
|
210 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
211 |
+
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
|
212 |
+
outputs=[md, md_text, output_file, pdf_show])
|
213 |
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
|
214 |
|
215 |
demo.launch()
|