yonigozlan HF staff committed on
Commit
2cb5324
·
1 Parent(s): add5814

initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ latex.png filter=lfs diff=lfs merge=lfs -text
37
+ multi_box.png filter=lfs diff=lfs merge=lfs -text
38
+ sheet_music.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ import re
4
+ import shutil
5
+ import time
6
+ import uuid
7
+ from pathlib import Path
8
+
9
+ import cv2
10
+ import gradio as gr
11
+ import numpy as np
12
+ import spaces
13
+ import torch
14
+ from globe import description, title
15
+ from PIL import Image
16
+ from render import render_ocr_text
17
+
18
+ from transformers import AutoModelForImageTextToText, AutoProcessor
19
+ from transformers.image_utils import load_image
20
+
21
# Hub checkpoint that holds the Transformers port of GOT-OCR 2.0.
model_name = "yonigozlan/GOT-OCR-2.0-hf"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The processor and model are loaded once at import time and shared by
# every request handled by this process.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForImageTextToText.from_pretrained(
    model_name, low_cpu_mem_usage=True, device_map=device
)
model = model.eval().to(device)

# Scratch directories for uploaded images and rendered HTML results.
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"
# String passed to ``generate`` as the stop string.
stop_str = "<|im_end|>"
for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder, exist_ok=True)

input_index = 0
39
+
40
+
41
def _parse_ocr_box(ocr_box):
    """Convert the UI box value into ``[x1, y1, x2, y2]``.

    The Gradio textbox delivers the box as text (e.g. ``"[100,100,200,200]"``),
    so the numbers are extracted here. ``None`` or a blank string means "no
    box" and yields ``None``.

    Raises:
        ValueError: if the value does not contain exactly four numbers.
    """
    if ocr_box is None:
        return None
    if isinstance(ocr_box, str):
        if not ocr_box.strip():
            return None
        numbers = re.findall(r"-?\d+(?:\.\d+)?", ocr_box)
        coords = [float(n) if "." in n else int(n) for n in numbers]
    else:
        coords = list(ocr_box)
    if len(coords) != 4:
        raise ValueError("OCR box must contain exactly four values: x1,y1,x2,y2")
    return coords


def _store_input_image(img, image_path):
    """Write one UI-supplied image to ``image_path``.

    Accepts an ImageEditor dict (its ``"composite"`` entry is used), a numpy
    array (treated as RGB) or a file path.

    Returns:
        ``None`` on success, otherwise an ``"Error: ..."`` message.
    """
    if isinstance(img, dict):
        composite_image = img.get("composite")
        if composite_image is None:
            return "Error: No composite image found in ImageEditor output"
        if isinstance(composite_image, np.ndarray):
            cv2.imwrite(image_path, cv2.cvtColor(composite_image, cv2.COLOR_RGB2BGR))
        elif isinstance(composite_image, Image.Image):
            composite_image.save(image_path)
        else:
            return "Error: Unsupported image format from ImageEditor"
    elif isinstance(img, np.ndarray):
        cv2.imwrite(image_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    elif isinstance(img, str):
        shutil.copy(img, image_path)
    else:
        return "Error: Unsupported image format"
    return None


def _task_settings(task, ocr_type, ocr_box, ocr_color):
    """Map a task name to ``(processor_kwargs, ocr_type)``.

    Returns ``None`` for a task name this app does not know. Tasks that always
    produce formatted output force ``ocr_type`` to ``"format"``; the
    fine-grained tasks keep the caller's ``ocr_type``.
    """
    if task == "Plain Text OCR":
        return {}, ocr_type
    if task == "Format Text OCR":
        return {"format": True}, "format"
    if task == "Fine-grained OCR (Box)":
        return {"box": _parse_ocr_box(ocr_box)}, ocr_type
    if task == "Fine-grained OCR (Color)":
        return {"color": ocr_color}, ocr_type
    if task == "Multi-crop OCR":
        return (
            {"format": True, "crop_to_patches": True, "max_patches": 5},
            "format",
        )
    if task == "Multi-page OCR":
        return {"multi_page": True, "format": True}, "format"
    return None


def _run_ocr(images, processor_kwargs):
    """Run the processor and model on ``images`` and return the decoded text."""
    # Send the inputs to wherever the model lives instead of assuming CUDA.
    inputs = processor(images, return_tensors="pt", **processor_kwargs).to(
        model.device
    )
    generate_ids = model.generate(
        **inputs,
        do_sample=False,
        tokenizer=processor.tokenizer,
        stop_strings=stop_str,
        max_new_tokens=4096,
    )
    # Skip the prompt tokens so only newly generated text is decoded.
    return processor.decode(
        generate_ids[0, inputs["input_ids"].shape[1] :],
        skip_special_tokens=True,
    )


@spaces.GPU()
def process_image(image, task, ocr_type=None, ocr_box=None, ocr_color=None):
    """Run one OCR task on the given image(s).

    Args:
        image: a single image (file path, numpy array or ImageEditor dict), or
            a list/tuple of gallery items whose first element is the image.
        task: one of the task names offered by the task dropdown.
        ocr_type: ``"ocr"`` or ``"format"``; only consulted for the
            fine-grained tasks, where it decides how the result is rendered.
        ocr_box: box for "Fine-grained OCR (Box)", as text or a sequence.
        ocr_color: color name for "Fine-grained OCR (Color)".

    Returns:
        ``(text, html_or_None, unique_id)`` on success, or
        ``("Error: ...", None, None)`` on any failure. Exceptions are never
        propagated to the caller.
    """
    if image is None:
        return "Error: No image provided", None, None

    unique_id = str(uuid.uuid4())
    image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.png")
    result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}.html")
    try:
        if isinstance(image, (tuple, list)):
            images = [item[0] for item in image]
        else:
            images = [image]

        # Every image goes through the same scratch file and is loaded right
        # after it has been written.
        for index, img in enumerate(images):
            error = _store_input_image(img, image_path)
            if error is not None:
                return error, None, None
            images[index] = load_image(image_path)

        settings = _task_settings(task, ocr_type, ocr_box, ocr_color)
        if settings is None:
            return f"Error: Unsupported task: {task}", None, None
        processor_kwargs, ocr_type = settings

        res = _run_ocr(images, processor_kwargs)
        if task == "Plain Text OCR":
            # Plain text is never rendered to HTML.
            return res, None, unique_id

        render_ocr_text(res, result_path, format_text=ocr_type == "format")
        if os.path.exists(result_path):
            with open(result_path, "r", encoding="utf-8") as f:
                html_content = f.read()
            return res, html_content, unique_id
        return res, None, unique_id
    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"Error: {str(e)}", None, None
    finally:
        if os.path.exists(image_path):
            os.remove(image_path)
191
+
192
+
193
def update_image_input(task):
    """Return visibility updates for the image widgets of the selected task.

    The tuple holds five ``gr.update`` objects, matched by position to the
    outputs this callback is registered with: image input, image editor,
    editor submit button, gallery input, gallery submit button.
    """
    if task == "Fine-grained OCR (Color)":
        # Color mode works on the editor so the user can mark regions.
        flags = (False, True, True, False, False)
    elif task == "Multi-page OCR":
        # Multi-page mode takes several images through the gallery.
        flags = (False, False, False, True, True)
    else:
        flags = (True, False, False, False, False)
    return tuple(gr.update(visible=flag) for flag in flags)
218
+
219
+
220
def update_inputs(task):
    """Return visibility updates for all task-dependent widgets.

    The list holds nine ``gr.update`` objects, matched by position to the
    outputs this callback is registered with: OCR type, OCR box, OCR color,
    image input, image editor, submit button, editor submit button, gallery
    input, gallery submit button. An unrecognised task yields ``None``.
    """

    def show(**extra):
        return gr.update(visible=True, **extra)

    def hide():
        return gr.update(visible=False)

    if task in ("Plain Text OCR", "Format Text OCR", "Multi-crop OCR"):
        return [
            hide(),
            hide(),
            hide(),
            show(),
            hide(),
            show(),
            hide(),
            hide(),
            hide(),
        ]
    if task == "Fine-grained OCR (Box)":
        return [
            show(choices=["ocr", "format"]),
            show(),
            hide(),
            show(),
            hide(),
            show(),
            hide(),
            hide(),
            hide(),
        ]
    if task == "Fine-grained OCR (Color)":
        return [
            show(choices=["ocr", "format"]),
            hide(),
            show(choices=["red", "green", "blue"]),
            hide(),
            show(),
            hide(),
            show(),
            hide(),
            hide(),
        ]
    if task == "Multi-page OCR":
        return [
            hide(),
            hide(),
            hide(),
            hide(),
            hide(),
            hide(),
            hide(),
            show(),
            show(),
        ]
    return None
273
+
274
+
275
def parse_latex_output(res):
    """Wrap LaTeX-looking segments of ``res`` in ``$$`` markers.

    ``res`` is split around existing ``$$...$$`` spans. Consecutive segments
    that contain a LaTeX-looking character (braces, brackets, backslash,
    ``$``, ``_``, ``^`` or a double quote) are collected between an opening
    and a closing ``"$$"`` entry; other segments are kept as stripped text.
    The resulting entries are joined with the separator ``"$$\\$$\\n"``.

    Args:
        res: raw OCR text.

    Returns:
        The joined string; plain text without LaTeX markers is returned
        stripped and otherwise unchanged.
    """
    # One character class replaces the per-segment pattern list. The caret
    # must be matched literally: a bare "^" pattern is a start-of-string
    # anchor that matches every segment.
    latex_marker = re.compile(r'[{}\[\]\\$_^"]')

    # Split the input, preserving existing $$...$$ spans as their own items.
    lines = re.split(r"(\$\$.*?\$\$)", res, flags=re.DOTALL)
    parsed_lines = []
    in_latex = False
    latex_buffer = []

    for line in lines:
        if line == "\n":
            # A bare newline stays with whichever run is currently open.
            if in_latex:
                latex_buffer.append(line)
            else:
                parsed_lines.append(line)
            continue

        line = line.strip()

        if latex_marker.search(line):
            if not in_latex:
                in_latex = True
                latex_buffer = ["$$"]
            latex_buffer.append(line)
        else:
            if in_latex:
                # Close the pending LaTeX run before emitting plain text.
                latex_buffer.append("$$")
                parsed_lines.extend(latex_buffer)
                in_latex = False
                latex_buffer = []
            parsed_lines.append(line)

    if in_latex:
        latex_buffer.append("$$")
        parsed_lines.extend(latex_buffer)

    return "$$\\$$\n".join(parsed_lines)
313
+
314
+
315
def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
    """Gradio callback: run OCR and build the values for the two outputs.

    Returns ``(text, html)``. ``html`` is ``None`` when processing failed or
    when no rendered result exists; otherwise it is a download link followed
    by an iframe, both carrying the rendered page as a base64 data URL.
    """
    text, html_content, unique_id = process_image(
        image, task, ocr_type, ocr_box, ocr_color
    )

    failed = isinstance(text, str) and text.startswith("Error:")
    if failed:
        return text, None

    formatted_res = text.replace("\\title", "\\title ")
    if not html_content:
        return formatted_res, None

    encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
    iframe_src = f"data:text/html;base64,{encoded_html}"
    iframe = f'<iframe src="{iframe_src}" width="100%" height="600px"></iframe>'
    download_link = f'<a href="data:text/html;base64,{encoded_html}" download="result_{unique_id}.html">Download Full Result</a>'
    return formatted_res, f"{download_link}<br>{iframe}"
334
+
335
+
336
def cleanup_old_files():
    """Delete files older than one hour from the upload and results folders.

    Only regular files are removed; sub-directories are left alone. A file
    that disappears while the folder is being scanned is skipped instead of
    aborting the sweep.
    """
    current_time = time.time()
    for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
        for file_path in Path(folder).glob("*"):
            try:
                if not file_path.is_file():
                    continue
                if current_time - file_path.stat().st_mtime > 3600:  # 1 hour
                    file_path.unlink()
            except FileNotFoundError:
                # Removed by someone else between glob() and stat()/unlink().
                continue
342
+
343
+
344
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
345
+ gr.Markdown(title)
346
+ gr.Markdown(description)
347
+
348
+ with gr.Row():
349
+ with gr.Column(scale=1):
350
+ with gr.Group():
351
+ image_input = gr.Image(type="filepath", label="Input Image")
352
+ gallery_input = gr.Gallery(
353
+ type="filepath", label="Input images", visible=False
354
+ )
355
+ image_editor = gr.ImageEditor(
356
+ label="Image Editor", type="pil", visible=False
357
+ )
358
+ task_dropdown = gr.Dropdown(
359
+ choices=[
360
+ "Plain Text OCR",
361
+ "Format Text OCR",
362
+ "Fine-grained OCR (Box)",
363
+ "Fine-grained OCR (Color)",
364
+ "Multi-crop OCR",
365
+ "Multi-page OCR",
366
+ ],
367
+ label="Select Task",
368
+ value="Plain Text OCR",
369
+ )
370
+ ocr_type_dropdown = gr.Dropdown(
371
+ choices=["ocr", "format"], label="OCR Type", visible=False
372
+ )
373
+ ocr_box_input = gr.Textbox(
374
+ label="OCR Box (x1,y1,x2,y2)",
375
+ placeholder="[100,100,200,200]",
376
+ visible=False,
377
+ )
378
+ ocr_color_dropdown = gr.Dropdown(
379
+ choices=["red", "green", "blue"], label="OCR Color", visible=False
380
+ )
381
+ # with gr.Row():
382
+ # max_new_tokens_slider = gr.Slider(50, 500, step=10, value=150, label="Max New Tokens")
383
+ # no_repeat_ngram_size_slider = gr.Slider(1, 10, step=1, value=2, label="No Repeat N-gram Size")
384
+
385
+ submit_button = gr.Button("Process")
386
+ editor_submit_button = gr.Button("Process Edited Image", visible=False)
387
+ gallery_submit_button = gr.Button(
388
+ "Process Multiple Images", visible=False
389
+ )
390
+
391
+ with gr.Column(scale=1):
392
+ with gr.Group():
393
+ output_markdown = gr.Textbox(label="Text output")
394
+ output_html = gr.HTML(label="HTML output")
395
+
396
+ input_types = [
397
+ image_input,
398
+ image_editor,
399
+ gallery_input,
400
+ ]
401
+
402
+ task_dropdown.change(
403
+ update_inputs,
404
+ inputs=[task_dropdown],
405
+ outputs=[
406
+ ocr_type_dropdown,
407
+ ocr_box_input,
408
+ ocr_color_dropdown,
409
+ image_input,
410
+ image_editor,
411
+ submit_button,
412
+ editor_submit_button,
413
+ gallery_input,
414
+ gallery_submit_button,
415
+ ],
416
+ )
417
+
418
+ task_dropdown.change(
419
+ update_image_input,
420
+ inputs=[task_dropdown],
421
+ outputs=[
422
+ image_input,
423
+ image_editor,
424
+ editor_submit_button,
425
+ gallery_input,
426
+ gallery_submit_button,
427
+ ],
428
+ )
429
+
430
+ submit_button.click(
431
+ ocr_demo,
432
+ inputs=[
433
+ image_input,
434
+ task_dropdown,
435
+ ocr_type_dropdown,
436
+ ocr_box_input,
437
+ ocr_color_dropdown,
438
+ ],
439
+ outputs=[output_markdown, output_html],
440
+ )
441
+ editor_submit_button.click(
442
+ ocr_demo,
443
+ inputs=[
444
+ image_editor,
445
+ task_dropdown,
446
+ ocr_type_dropdown,
447
+ ocr_box_input,
448
+ ocr_color_dropdown,
449
+ ],
450
+ outputs=[output_markdown, output_html],
451
+ )
452
+ gallery_submit_button.click(
453
+ ocr_demo,
454
+ inputs=[
455
+ gallery_input,
456
+ task_dropdown,
457
+ ocr_type_dropdown,
458
+ ocr_box_input,
459
+ ocr_color_dropdown,
460
+ ],
461
+ outputs=[output_markdown, output_html],
462
+ )
463
+ example = gr.Examples(
464
+ examples=[
465
+ [
466
+ "./sheet_music.png",
467
+ "Format Text OCR",
468
+ "format",
469
+ None,
470
+ None,
471
+ ],
472
+ [
473
+ "./latex.png",
474
+ "Format Text OCR",
475
+ "format",
476
+ None,
477
+ None,
478
+ ],
479
+ ],
480
+ inputs=[
481
+ image_input,
482
+ task_dropdown,
483
+ ocr_type_dropdown,
484
+ ocr_box_input,
485
+ ocr_color_dropdown,
486
+ ],
487
+ outputs=[output_markdown, output_html],
488
+ )
489
+ example_finegrained = gr.Examples(
490
+ examples=[
491
+ [
492
+ "./multi_box.png",
493
+ "Fine-grained OCR (Color)",
494
+ "ocr",
495
+ None,
496
+ "red",
497
+ ]
498
+ ],
499
+ inputs=[
500
+ image_editor,
501
+ task_dropdown,
502
+ ocr_type_dropdown,
503
+ ocr_box_input,
504
+ ocr_color_dropdown,
505
+ ],
506
+ outputs=[output_markdown, output_html],
507
+ label="Fine-grained example",
508
+ )
509
+
510
+ gr.Markdown(
511
+ "Space based on [Tonic's GOT-OCR](https://huggingface.co/spaces/Tonic/GOT-OCR)"
512
+ )
513
+
514
+
515
+ if __name__ == "__main__":
516
+ cleanup_old_files()
517
+ demo.launch()
globe.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title = """# GOT-OCR 2.0: Transformers 🤗 implementation demo"""
2
+
3
+ description = """
4
+ This demo utilizes the **Transformers implementation of GOT-OCR 2.0** to extract text from images.
5
+ The GOT-OCR 2.0 model was introduced in the paper:
6
+ [**General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model**](https://arxiv.org/abs/2409.01704)
7
+ by *Haoran Wei, Chenglong Liu, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Liang Zhao, Jianjian Sun, Yuang Peng, Chunrui Han, and Xiangyu Zhang*.
8
+
9
+ ### Key Features
10
+ GOT-OCR 2.0 is a **state-of-the-art OCR model** designed to handle a wide variety of tasks, including:
11
+
12
+ - **Plain Text OCR**
13
+ - **Formatted Text OCR**
14
+ - **Fine-grained OCR**
15
+ - **Multi-crop OCR**
16
+ - **Multi-page OCR**
17
+
18
+ ### Beyond Text
19
+ GOT-OCR 2.0 has also been fine-tuned to work with non-textual data, such as:
20
+
21
+ - **Charts and Tables**
22
+ - **Math and Molecular Formulas**
23
+ - **Geometric Shapes**
24
+ - **Sheet Music**
25
+
26
+ Explore the capabilities of this cutting-edge model through this interactive demo!
27
+ """
28
+
29
+ tasks = [
30
+ "Plain Text OCR",
31
+ "Format Text OCR",
32
+ "Fine-grained OCR (Box)",
33
+ "Fine-grained OCR (Color)",
34
+ "Multi-crop OCR",
35
+ "Multi-page OCR",
36
+ ]
37
+
38
+ ocr_types = ["ocr", "format"]
39
+ ocr_colors = ["red", "green", "blue"]
latex.png ADDED

Git LFS Details

  • SHA256: 47f3e4388a5efcb36da513213497adcaedf15e4a769557d6e0dac768ee961f78
  • Pointer size: 131 Bytes
  • Size of remote file: 435 kB
multi_box.png ADDED

Git LFS Details

  • SHA256: 841238eccecfae8e7c21b196326e519b57f681d35619d773c25b8643aaa823a1
  • Pointer size: 131 Bytes
  • Size of remote file: 697 kB
render.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Full-width punctuation mapped to its ASCII counterpart; applied to TikZ
# source before it is embedded in the page.
punctuation_dict = {
    "\uff0c": ",",  # full-width comma
    "\u3002": ".",  # ideographic full stop
}
translation_table = str.maketrans(punctuation_dict)
stop_str = "<|im_end|>"

# Placeholder in both HTML templates where the OCR payload is spliced in.
_TEMPLATE_MARKER = "const text ="
# Template paths are relative to the current working directory.
_MMD_TEMPLATE = "./render_tools/content-mmd-to-html.html"
_TIKZ_TEMPLATE = "./render_tools/tikz.html"

# Sized delimiters downgraded to plain ones when \left/\right are unbalanced.
_DELIMITER_FALLBACKS = (
    ("\\left(", "("),
    ("\\right)", ")"),
    ("\\left[", "["),
    ("\\right]", "]"),
    ("\\left{", "{"),
    ("\\right}", "}"),
    ("\\left|", "|"),
    ("\\right|", "|"),
    ("\\left.", "."),
    ("\\right.", "."),
)


def svg_to_html(svg_content, output_filename):
    """Wrap ``svg_content`` in a minimal HTML page and write it to disk.

    The page declares UTF-8, so the file is written as UTF-8 regardless of
    the platform's default encoding.
    """
    html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>SVG Embedded in HTML</title>
</head>
<body>
    <svg width="2100" height="15000" xmlns="http://www.w3.org/2000/svg">
        {svg_content}
    </svg>
</body>
</html>
"""

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(html_content)


def _mmd_js_literal(text):
    """Return ``text`` as a JavaScript string-concatenation expression.

    Each input line becomes one double-quoted JS literal ending in ``\\n``;
    the literals are chained with ``+``.
    """
    if text.count("\\right") != text.count("\\left"):
        # Unbalanced \left/\right pairs: fall back to plain delimiters.
        for sized, plain in _DELIMITER_FALLBACKS:
            text = text.replace(sized, plain)

    # Double quotes would end the JS literal early; "$" markers are dropped.
    text = text.replace('"', "``").replace("$", "")

    return "+\n".join(
        '"' + line.replace("\\", "\\\\") + r"\n" + '"' for line in text.split("\n")
    )


def _tikz_body(text):
    """Normalise TikZ source so every statement line ends with ``;``."""
    text = text.translate(translation_table)
    pieces = []
    for out in text.split("\n"):
        if not out:
            continue
        if "\\begin{tikzpicture}" in out or "\\end{tikzpicture}" in out:
            pieces.append(out + "\n")
            continue
        # rstrip is safe on lines made only of spaces, which are then skipped.
        out = out.rstrip(" ")
        if not out:
            continue
        if out[-1] != ";":
            # NOTE(review): the final character is replaced by ";" rather than
            # ";" being appended - presumably to swap a stray terminator;
            # confirm this is intended.
            pieces.append(out[:-1] + ";\n")
        else:
            pieces.append(out + "\n")
    return "".join(pieces)


def render_ocr_text(text, result_path, format_text=False):
    """Render OCR output to an HTML file at ``result_path`` when possible.

    * Text containing ``**kern`` is engraved with verovio and embedded as SVG.
    * Otherwise, when ``format_text`` is true, the text is spliced into the
      TikZ template (if it contains ``\\begin{tikzpicture}``) or into the
      Mathpix Markdown template.
    * In every other case no file is written.

    Args:
        text: decoded model output; a trailing stop string is removed.
        result_path: destination HTML file.
        format_text: whether the text is formatted output that should be
            rendered.

    Raises:
        ValueError: if the chosen template lacks the ``const text =`` marker.
        OSError: if a template cannot be read or the result cannot be written.
    """
    if text.endswith(stop_str):
        text = text[: -len(stop_str)]
    text = text.strip()

    if "**kern" in text:
        # Imported lazily so verovio is only needed for sheet-music output.
        import verovio

        tk = verovio.toolkit()
        tk.loadData(text)
        tk.setOptions(
            {
                "pageWidth": 2100,
                "footer": "none",
                "barLineWidth": 0.5,
                "beamMaxSlope": 15,
                "staffLineWidth": 0.2,
                "spacingStaff": 6,
            }
        )
        tk.getPageCount()
        svg = tk.renderToSVG()
        svg = svg.replace('overflow="inherit"', 'overflow="visible"')

        svg_to_html(svg, result_path)
        return

    if not format_text:
        return

    if "\\begin{tikzpicture}" not in text:
        template_path = _MMD_TEMPLATE
        # The marker is kept so the page ends up with `const text = "..."`.
        payload = _TEMPLATE_MARKER + _mmd_js_literal(text)
    else:
        template_path = _TIKZ_TEMPLATE
        # The marker is dropped: the script body must be raw TikZ source.
        payload = _tikz_body(text)

    with open(template_path, "r", encoding="utf-8") as web_f:
        template = web_f.read()
    head, marker, tail = template.partition(_TEMPLATE_MARKER)
    if not marker:
        raise ValueError(
            f"Template {template_path} does not contain {_TEMPLATE_MARKER!r}"
        )

    with open(result_path, "w", encoding="utf-8") as web_f_new:
        web_f_new.write(head + payload + tail)
render_tools/content-mmd-to-html.html ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en" data-lt-installed="true"><head>
3
+ <meta charset="UTF-8">
4
+ <title>Title</title>
5
+ <script>
6
+ const text =
7
+ </script>
8
+ <style>
9
+ #content {
10
+ max-width: 800px;
11
+ margin: auto;
12
+ }
13
+ </style>
14
+ <script>
15
+ let script = document.createElement('script');
16
+ script.src = "https://cdn.jsdelivr.net/npm/mathpix-markdown-it@1.3.6/es5/bundle.js";
17
+ document.head.append(script);
18
+
19
+ script.onload = function() {
20
+ const isLoaded = window.loadMathJax();
21
+ if (isLoaded) {
22
+ console.log('Styles loaded!')
23
+ }
24
+
25
+ const el = window.document.getElementById('content-text');
26
+ if (el) {
27
+ const options = {
28
+ htmlTags: true
29
+ };
30
+ const html = window.render(text, options);
31
+ el.outerHTML = html;
32
+ }
33
+ };
34
+ </script>
35
+ </head>
36
+ <body>
37
+ <div id="content"><div id="content-text"></div></div>
38
+ </body>
39
+ </html>
render_tools/tikz.html ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+
3
+ <html>
4
+
5
+ <head>
6
+ <meta charset="UTF-8">
7
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
8
+ <title>Document</title>
9
+ <link rel="stylesheet" type="text/css" href="https://tikzjax.com/v1/fonts.css">
10
+ <script src="https://tikzjax.com/v1/tikzjax.js"></script>
11
+ </head>
12
+ <body>
13
+ <script type="text/tikz">
14
+ const text =
15
+ </script>
16
+ </body>
17
+ </html>
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch==2.5.1
2
+ torchvision==0.20.1
3
+ git+https://github.com/yonigozlan/transformers.git@add-got-ocr2
4
+ verovio
5
+ opencv-python
6
+ numpy==1.26.3
7
+ pillow
sheet_music.png ADDED

Git LFS Details

  • SHA256: 2b4d14e87b3c854e0a665b5c48e5ea9aefb03b7d89262eff668abe9d113637c0
  • Pointer size: 131 Bytes
  • Size of remote file: 735 kB