openfree committed on
Commit
0767b88
·
verified ·
1 Parent(s): 802f274

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -430
app.py CHANGED
@@ -1,431 +1,2 @@
1
- import base64
2
- import json
3
  import os
4
- import time
5
- import zipfile
6
- from pathlib import Path
7
- import re
8
- import uuid
9
- import pymupdf
10
-
11
- os.system('pip uninstall -y magic-pdf')
12
- os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
13
-
14
- os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
15
- os.system('python download_models_hf.py')
16
-
17
- with open('/home/user/magic-pdf.json', 'r') as file:
18
- data = json.load(file)
19
-
20
- data['device-mode'] = "cuda"
21
- if os.getenv('apikey'):
22
- data['llm-aided-config']['title_aided']['api_key'] = os.getenv('apikey')
23
- data['llm-aided-config']['title_aided']['enable'] = True
24
-
25
- with open('/home/user/magic-pdf.json', 'w') as file:
26
- json.dump(data, file, indent=4)
27
-
28
- os.system('cp -r paddleocr /home/user/.paddleocr')
29
- from gradio_pdf import PDF
30
-
31
- import gradio as gr
32
- from loguru import logger
33
-
34
- from magic_pdf.data.data_reader_writer import FileBasedDataReader
35
- from magic_pdf.libs.hash_utils import compute_sha256
36
- from magic_pdf.tools.common import do_parse, prepare_env
37
-
38
- def create_css():
39
- return """
40
- /* 전체 스타일 */
41
- .gradio-container {
42
- background: linear-gradient(135deg, #EFF6FF 0%, #F5F3FF 100%);
43
- max-width: 1200px !important;
44
- margin: 0 auto !important;
45
- padding: 2rem !important;
46
- }
47
-
48
- /* 제목 스타일 */
49
- .title-area {
50
- text-align: center;
51
- margin-bottom: 2rem;
52
- padding: 1rem;
53
- background: white;
54
- border-radius: 1rem;
55
- box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
56
- }
57
-
58
- .title-area h1 {
59
- background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%);
60
- -webkit-background-clip: text;
61
- -webkit-text-fill-color: transparent;
62
- font-size: 2.5rem;
63
- font-weight: bold;
64
- margin-bottom: 0.5rem;
65
- }
66
-
67
- .title-area p {
68
- color: #6B7280;
69
- font-size: 1.1rem;
70
- }
71
-
72
- /* 컴포넌트 스타일링 */
73
- .gr-box, .gr-panel {
74
- border: 2px solid #E0E7FF !important;
75
- border-radius: 12px !important;
76
- box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
77
- background: white !important;
78
- }
79
-
80
- /* 파일 업로드 영역 */
81
- .file-upload {
82
- border: 2px dashed #93C5FD !important;
83
- border-radius: 8px !important;
84
- padding: 2rem !important;
85
- background: #F0F9FF !important;
86
- transition: all 0.3s ease;
87
- }
88
-
89
- .file-upload:hover {
90
- background: #E0F2FE !important;
91
- border-color: #60A5FA !important;
92
- }
93
-
94
- /* 버튼 스타일링 */
95
- .gr-button.primary-button {
96
- background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%) !important;
97
- color: white !important;
98
- border: none !important;
99
- border-radius: 8px !important;
100
- padding: 0.75rem 1.5rem !important;
101
- font-weight: bold !important;
102
- transition: opacity 0.2s !important;
103
- }
104
-
105
- .gr-button.primary-button:hover {
106
- opacity: 0.9 !important;
107
- }
108
-
109
- .gr-button.secondary-button {
110
- background: white !important;
111
- color: #4B5563 !important;
112
- border: 1px solid #D1D5DB !important;
113
- border-radius: 8px !important;
114
- padding: 0.75rem 1.5rem !important;
115
- }
116
-
117
- .gr-button.secondary-button:hover {
118
- background: #F9FAFB !important;
119
- }
120
-
121
- /* 슬라이더 스타일링 */
122
- .gr-slider {
123
- background: #E0E7FF !important;
124
- }
125
-
126
- .gr-slider .gr-slider-handle {
127
- background: #4F46E5 !important;
128
- }
129
-
130
- /* 체크박스 스타일링 */
131
- .gr-checkbox {
132
- border-color: #6366F1 !important;
133
- }
134
-
135
- .gr-checkbox:checked {
136
- background-color: #4F46E5 !important;
137
- }
138
-
139
- /* 탭 스타일링 */
140
- .gr-tabs {
141
- border-bottom: 2px solid #E0E7FF !important;
142
- }
143
-
144
- .gr-tab-button {
145
- color: #6B7280 !important;
146
- padding: 0.75rem 1rem !important;
147
- font-weight: 500 !important;
148
- }
149
-
150
- .gr-tab-button.selected {
151
- color: #4F46E5 !important;
152
- border-bottom: 2px solid #4F46E5 !important;
153
- }
154
-
155
- /* 마크다운 출력 영역 */
156
- .markdown-output {
157
- background: white !important;
158
- border-radius: 8px !important;
159
- padding: 1rem !important;
160
- box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.05) !important;
161
- }
162
- """
163
-
164
- def read_fn(path):
165
- disk_rw = FileBasedDataReader(os.path.dirname(path))
166
- return disk_rw.read(os.path.basename(path))
167
-
168
- def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
169
- os.makedirs(output_dir, exist_ok=True)
170
-
171
- try:
172
- file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
173
- pdf_data = read_fn(doc_path)
174
- if is_ocr:
175
- parse_method = "ocr"
176
- else:
177
- parse_method = "auto"
178
- local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
179
- do_parse(
180
- output_dir,
181
- file_name,
182
- pdf_data,
183
- [],
184
- parse_method,
185
- False,
186
- end_page_id=end_page_id,
187
- layout_model=layout_mode,
188
- formula_enable=formula_enable,
189
- table_enable=table_enable,
190
- lang=language,
191
- f_dump_orig_pdf=False,
192
- )
193
- return local_md_dir, file_name
194
- except Exception as e:
195
- logger.exception(e)
196
-
197
- def compress_directory_to_zip(directory_path, output_zip_path):
198
- try:
199
- with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
200
- for root, dirs, files in os.walk(directory_path):
201
- for file in files:
202
- file_path = os.path.join(root, file)
203
- arcname = os.path.relpath(file_path, directory_path)
204
- zipf.write(file_path, arcname)
205
- return 0
206
- except Exception as e:
207
- logger.exception(e)
208
- return -1
209
-
210
- def image_to_base64(image_path):
211
- with open(image_path, "rb") as image_file:
212
- return base64.b64encode(image_file.read()).decode('utf-8')
213
-
214
- def replace_image_with_base64(markdown_text, image_dir_path):
215
- pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
216
- def replace(match):
217
- relative_path = match.group(1)
218
- full_path = os.path.join(image_dir_path, relative_path)
219
- base64_image = image_to_base64(full_path)
220
- return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"
221
- return re.sub(pattern, replace, markdown_text)
222
-
223
- def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
224
- file_path = to_pdf(file_path)
225
- if end_pages > 20:
226
- end_pages = 20
227
- local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
228
- layout_mode, formula_enable, table_enable, language)
229
- archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
230
- zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
231
- if zip_archive_success == 0:
232
- logger.info("압축 성공")
233
- else:
234
- logger.error("압축 실패")
235
- md_path = os.path.join(local_md_dir, file_name + ".md")
236
- with open(md_path, 'r', encoding='utf-8') as f:
237
- txt_content = f.read()
238
- md_content = replace_image_with_base64(txt_content, local_md_dir)
239
- new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
240
- return md_content, txt_content, archive_zip_path, new_pdf_path
241
-
242
- def to_pdf(file_path):
243
- with pymupdf.open(file_path) as f:
244
- if f.is_pdf:
245
- return file_path
246
- else:
247
- pdf_bytes = f.convert_to_pdf()
248
- unique_filename = f"{uuid.uuid4()}.pdf"
249
- tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
250
- with open(tmp_file_path, 'wb') as tmp_pdf_file:
251
- tmp_pdf_file.write(pdf_bytes)
252
- return tmp_file_path
253
-
254
- latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
255
- {"left": '$', "right": '$', "display": False}]
256
-
257
- def init_model():
258
- from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
259
- try:
260
- model_manager = ModelSingleton()
261
- txt_model = model_manager.get_model(False, False)
262
- logger.info(f"txt_model init final")
263
- ocr_model = model_manager.get_model(True, False)
264
- logger.info(f"ocr_model init final")
265
- return 0
266
- except Exception as e:
267
- logger.exception(e)
268
- return -1
269
-
270
- model_init = init_model()
271
- logger.info(f"model_init: {model_init}")
272
-
273
- latin_lang = [
274
- 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
275
- 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
276
- 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
277
- 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
278
- ]
279
- arabic_lang = ['ar', 'fa', 'ug', 'ur']
280
- cyrillic_lang = [
281
- 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
282
- 'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
283
- ]
284
- devanagari_lang = [
285
- 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
286
- 'sa', 'bgc'
287
- ]
288
- other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
289
-
290
- all_lang = ['', 'auto']
291
- all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
292
-
293
- if __name__ == "__main__":
294
- with gr.Blocks(title="OCR FLEX", css=create_css()) as demo:
295
- # 타이틀 영역
296
- with gr.Row(elem_classes="title-area"):
297
- gr.HTML("""
298
- <h1>OCR FLEX</h1>
299
- <p>PDF와 이미지에서 텍스트를 빠르고 정확하게 추출하세요</p>
300
- """)
301
-
302
- with gr.Row():
303
- # 왼쪽 패널
304
- with gr.Column(variant='panel', scale=5):
305
- file = gr.File(
306
- label="PDF 또는 이미지 파일을 업로드하세요",
307
- file_types=[".pdf", ".png", ".jpeg", ".jpg"],
308
- elem_classes="file-upload"
309
- )
310
-
311
- max_pages = gr.Slider(
312
- 1, 20, 10,
313
- step=1,
314
- label='최대 변환 페이지 수',
315
- elem_classes="custom-slider"
316
- )
317
-
318
- with gr.Row():
319
- layout_mode = gr.Dropdown(
320
- ["layoutlmv3", "doclayout_yolo"],
321
- label="레이아웃 모델",
322
- value="doclayout_yolo",
323
- elem_classes="custom-dropdown"
324
- )
325
- language = gr.Dropdown(
326
- all_lang,
327
- label="언어",
328
- value='auto',
329
- elem_classes="custom-dropdown"
330
- )
331
-
332
- with gr.Row():
333
- formula_enable = gr.Checkbox(
334
- label="수식 인식 활성화",
335
- value=True,
336
- elem_classes="custom-checkbox"
337
- )
338
- is_ocr = gr.Checkbox(
339
- label="OCR 강제 활성화",
340
- value=False,
341
- elem_classes="custom-checkbox"
342
- )
343
- table_enable = gr.Checkbox(
344
- label="표 인식 활성화(테스트)",
345
- value=True,
346
- elem_classes="custom-checkbox"
347
- )
348
-
349
- with gr.Row():
350
- change_bu = gr.Button(
351
- "변환",
352
- elem_classes="primary-button"
353
- )
354
- clear_bu = gr.ClearButton(
355
- value="초기화",
356
- elem_classes="secondary-button"
357
- )
358
-
359
- pdf_show = PDF(
360
- label='PDF 미리보기',
361
- interactive=False,
362
- visible=True,
363
- height=800,
364
- elem_classes="pdf-preview"
365
- )
366
-
367
- with gr.Accordion("예제:", open=False):
368
- example_root = os.path.join(os.path.dirname(__file__), "examples")
369
- gr.Examples(
370
- examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
371
- _.endswith("pdf")],
372
- inputs=file
373
- )
374
-
375
- # 오른쪽 패널
376
- with gr.Column(variant='panel', scale=5):
377
- output_file = gr.File(
378
- label="변환 결과",
379
- interactive=False,
380
- elem_classes="output-file"
381
- )
382
-
383
- with gr.Tabs() as tabs:
384
- with gr.Tab("마크다운 렌더링"):
385
- md = gr.Markdown(
386
- label="마크다운 렌더링",
387
- height=1100,
388
- show_copy_button=True,
389
- latex_delimiters=latex_delimiters,
390
- line_breaks=True,
391
- elem_classes="markdown-output"
392
- )
393
-
394
- with gr.Tab("마크다운 텍스트"):
395
- md_text = gr.TextArea(
396
- lines=45,
397
- show_copy_button=True,
398
- elem_classes="markdown-text"
399
- )
400
-
401
- # 이벤트 핸들러
402
- file.change(
403
- fn=to_pdf,
404
- inputs=file,
405
- outputs=pdf_show
406
- )
407
-
408
- change_bu.click(
409
- fn=to_markdown,
410
- inputs=[
411
- file,
412
- max_pages,
413
- is_ocr,
414
- layout_mode,
415
- formula_enable,
416
- table_enable,
417
- language
418
- ],
419
- outputs=[
420
- md,
421
- md_text,
422
- output_file,
423
- pdf_show
424
- ],
425
- api_name=False
426
- )
427
-
428
- clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
429
-
430
- # 앱 실행
431
- demo.launch(ssr_mode=True)
 
 
 
1
  import os
2
+ exec(os.environ.get('APP'))