Spaces:

yonigozlan
/

GOT-OCR-Transformers

Running on Zero

App Files Files Community

yonigozlan HF staff committed 27 days ago

Commit

2cb5324

1 Parent(s): add5814

initial commit

Browse files

Files changed (10) hide show

.gitattributes +3 -0
app.py +517 -0
globe.py +39 -0
latex.png +3 -0
multi_box.png +3 -0
render.py +119 -0
render_tools/content-mmd-to-html.html +39 -0
render_tools/tikz.html +17 -0
requirements.txt +7 -0
sheet_music.png +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+latex.png filter=lfs diff=lfs merge=lfs -text
+multi_box.png filter=lfs diff=lfs merge=lfs -text
+sheet_music.png filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,517 @@

+import base64
+import os
+import re
+import shutil
+import time
+import uuid
+from pathlib import Path
+import cv2
+import gradio as gr
+import numpy as np
+import spaces
+import torch
+from globe import description, title
+from PIL import Image
+from render import render_ocr_text
+from transformers import AutoModelForImageTextToText, AutoProcessor
+from transformers.image_utils import load_image
+model_name = "yonigozlan/GOT-OCR-2.0-hf"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+processor = AutoProcessor.from_pretrained(model_name)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_name, low_cpu_mem_usage=True, device_map=device
+)
+model = model.eval().to(device)
+UPLOAD_FOLDER = "./uploads"
+RESULTS_FOLDER = "./results"
+stop_str = "<|im_end|>"
+for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+input_index = 0
+@spaces.GPU()
+def process_image(image, task, ocr_type=None, ocr_box=None, ocr_color=None):
+    if image is None:
+        return "Error: No image provided", None, None
+    unique_id = str(uuid.uuid4())
+    image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.png")
+    result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}.html")
+    try:
+        if not isinstance(image, (tuple, list)):
+            image = [image]
+        else:
+            image = [img[0] for img in image]
+        for i, img in enumerate(image):
+            if isinstance(img, dict):
+                composite_image = img.get("composite")
+                if composite_image is not None:
+                    if isinstance(composite_image, np.ndarray):
+                        cv2.imwrite(
+                            image_path, cv2.cvtColor(composite_image, cv2.COLOR_RGB2BGR)
+                        )
+                    elif isinstance(composite_image, Image.Image):
+                        composite_image.save(image_path)
+                    else:
+                        return (
+                            "Error: Unsupported image format from ImageEditor",
+                            None,
+                            None,
+                        )
+                else:
+                    return (
+                        "Error: No composite image found in ImageEditor output",
+                        None,
+                        None,
+                    )
+            elif isinstance(img, np.ndarray):
+                cv2.imwrite(image_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
+            elif isinstance(img, str):
+                shutil.copy(img, image_path)
+            else:
+                return "Error: Unsupported image format", None, None
+            image[i] = load_image(image_path)
+        if task == "Plain Text OCR":
+            inputs = processor(image, return_tensors="pt").to("cuda")
+            generate_ids = model.generate(
+                **inputs,
+                do_sample=False,
+                tokenizer=processor.tokenizer,
+                stop_strings=stop_str,
+                max_new_tokens=4096,
+            )
+            res = processor.decode(
+                generate_ids[0, inputs["input_ids"].shape[1] :],
+                skip_special_tokens=True,
+            )
+            return res, None, unique_id
+        else:
+            if task == "Format Text OCR":
+                inputs = processor(image, return_tensors="pt", format=True).to("cuda")
+                generate_ids = model.generate(
+                    **inputs,
+                    do_sample=False,
+                    tokenizer=processor.tokenizer,
+                    stop_strings=stop_str,
+                    max_new_tokens=4096,
+                )
+                res = processor.decode(
+                    generate_ids[0, inputs["input_ids"].shape[1] :],
+                    skip_special_tokens=True,
+                )
+                ocr_type = "format"
+            elif task == "Fine-grained OCR (Box)":
+                inputs = processor(image, return_tensors="pt", box=ocr_box).to("cuda")
+                generate_ids = model.generate(
+                    **inputs,
+                    do_sample=False,
+                    tokenizer=processor.tokenizer,
+                    stop_strings=stop_str,
+                    max_new_tokens=4096,
+                )
+                res = processor.decode(
+                    generate_ids[0, inputs["input_ids"].shape[1] :],
+                    skip_special_tokens=True,
+                )
+            elif task == "Fine-grained OCR (Color)":
+                inputs = processor(image, return_tensors="pt", color=ocr_color).to(
+                    "cuda"
+                )
+                generate_ids = model.generate(
+                    **inputs,
+                    do_sample=False,
+                    tokenizer=processor.tokenizer,
+                    stop_strings=stop_str,
+                    max_new_tokens=4096,
+                )
+                res = processor.decode(
+                    generate_ids[0, inputs["input_ids"].shape[1] :],
+                    skip_special_tokens=True,
+                )
+            elif task == "Multi-crop OCR":
+                inputs = processor(
+                    image,
+                    return_tensors="pt",
+                    format=True,
+                    crop_to_patches=True,
+                    max_patches=5,
+                ).to("cuda")
+                generate_ids = model.generate(
+                    **inputs,
+                    do_sample=False,
+                    tokenizer=processor.tokenizer,
+                    stop_strings=stop_str,
+                    max_new_tokens=4096,
+                )
+                res = processor.decode(
+                    generate_ids[0, inputs["input_ids"].shape[1] :],
+                    skip_special_tokens=True,
+                )
+                ocr_type = "format"
+            elif task == "Multi-page OCR":
+                inputs = processor(
+                    image, return_tensors="pt", multi_page=True, format=True
+                ).to("cuda")
+                generate_ids = model.generate(
+                    **inputs,
+                    do_sample=False,
+                    tokenizer=processor.tokenizer,
+                    stop_strings=stop_str,
+                    max_new_tokens=4096,
+                )
+                res = processor.decode(
+                    generate_ids[0, inputs["input_ids"].shape[1] :],
+                    skip_special_tokens=True,
+                )
+                ocr_type = "format"
+            render_ocr_text(res, result_path, format_text=ocr_type == "format")
+            if os.path.exists(result_path):
+                with open(result_path, "r") as f:
+                    html_content = f.read()
+                return res, html_content, unique_id
+            else:
+                return res, None, unique_id
+    except Exception as e:
+        return f"Error: {str(e)}", None, None
+    finally:
+        if os.path.exists(image_path):
+            os.remove(image_path)
+def update_image_input(task):
+    if task == "Fine-grained OCR (Color)":
+        return (
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
+        )
+    elif task == "Multi-page OCR":
+        return (
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=True),
+        )
+    else:
+        return (
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+        )
+def update_inputs(task):
+    if task in [
+        "Plain Text OCR",
+        "Format Text OCR",
+        "Multi-crop OCR",
+    ]:
+        return [
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+        ]
+    elif task == "Fine-grained OCR (Box)":
+        return [
+            gr.update(visible=True, choices=["ocr", "format"]),
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+        ]
+    elif task == "Fine-grained OCR (Color)":
+        return [
+            gr.update(visible=True, choices=["ocr", "format"]),
+            gr.update(visible=False),
+            gr.update(visible=True, choices=["red", "green", "blue"]),
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
+        ]
+    elif task == "Multi-page OCR":
+        return [
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=True),
+        ]
+def parse_latex_output(res):
+    # Split the input, preserving newlines and empty lines
+    lines = re.split(r"(\$\$.*?\$\$)", res, flags=re.DOTALL)
+    parsed_lines = []
+    in_latex = False
+    latex_buffer = []
+    for line in lines:
+        if line == "\n":
+            if in_latex:
+                latex_buffer.append(line)
+            else:
+                parsed_lines.append(line)
+            continue
+        line = line.strip()
+        latex_patterns = [r"\{", r"\}", r"\[", r"\]", r"\\", r"\$", r"_", r"^", r'"']
+        contains_latex = any(re.search(pattern, line) for pattern in latex_patterns)
+        if contains_latex:
+            if not in_latex:
+                in_latex = True
+                latex_buffer = ["$$"]
+            latex_buffer.append(line)
+        else:
+            if in_latex:
+                latex_buffer.append("$$")
+                parsed_lines.extend(latex_buffer)
+                in_latex = False
+                latex_buffer = []
+            parsed_lines.append(line)
+    if in_latex:
+        latex_buffer.append("$$")
+        parsed_lines.extend(latex_buffer)
+    return "$$\\$$\n".join(parsed_lines)
+def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
+    res, html_content, unique_id = process_image(
+        image, task, ocr_type, ocr_box, ocr_color
+    )
+    if isinstance(res, str) and res.startswith("Error:"):
+        return res, None
+    res = res.replace("\\title", "\\title ")
+    formatted_res = res
+    # formatted_res = parse_latex_output(res)
+    if html_content:
+        encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
+        iframe_src = f"data:text/html;base64,{encoded_html}"
+        iframe = f'<iframe src="{iframe_src}" width="100%" height="600px"></iframe>'
+        download_link = f'<a href="data:text/html;base64,{encoded_html}" download="result_{unique_id}.html">Download Full Result</a>'
+        return formatted_res, f"{download_link}<br>{iframe}"
+    return formatted_res, None
+def cleanup_old_files():
+    current_time = time.time()
+    for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
+        for file_path in Path(folder).glob("*"):
+            if current_time - file_path.stat().st_mtime > 3600:  # 1 hour
+                file_path.unlink()
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(title)
+    gr.Markdown(description)
+    with gr.Row():
+        with gr.Column(scale=1):
+            with gr.Group():
+                image_input = gr.Image(type="filepath", label="Input Image")
+                gallery_input = gr.Gallery(
+                    type="filepath", label="Input images", visible=False
+                )
+                image_editor = gr.ImageEditor(
+                    label="Image Editor", type="pil", visible=False
+                )
+                task_dropdown = gr.Dropdown(
+                    choices=[
+                        "Plain Text OCR",
+                        "Format Text OCR",
+                        "Fine-grained OCR (Box)",
+                        "Fine-grained OCR (Color)",
+                        "Multi-crop OCR",
+                        "Multi-page OCR",
+                    ],
+                    label="Select Task",
+                    value="Plain Text OCR",
+                )
+                ocr_type_dropdown = gr.Dropdown(
+                    choices=["ocr", "format"], label="OCR Type", visible=False
+                )
+                ocr_box_input = gr.Textbox(
+                    label="OCR Box (x1,y1,x2,y2)",
+                    placeholder="[100,100,200,200]",
+                    visible=False,
+                )
+                ocr_color_dropdown = gr.Dropdown(
+                    choices=["red", "green", "blue"], label="OCR Color", visible=False
+                )
+                # with gr.Row():
+                # max_new_tokens_slider = gr.Slider(50, 500, step=10, value=150, label="Max New Tokens")
+                # no_repeat_ngram_size_slider = gr.Slider(1, 10, step=1, value=2, label="No Repeat N-gram Size")
+                submit_button = gr.Button("Process")
+                editor_submit_button = gr.Button("Process Edited Image", visible=False)
+                gallery_submit_button = gr.Button(
+                    "Process Multiple Images", visible=False
+                )
+        with gr.Column(scale=1):
+            with gr.Group():
+                output_markdown = gr.Textbox(label="Text output")
+                output_html = gr.HTML(label="HTML output")
+    input_types = [
+        image_input,
+        image_editor,
+        gallery_input,
+    ]
+    task_dropdown.change(
+        update_inputs,
+        inputs=[task_dropdown],
+        outputs=[
+            ocr_type_dropdown,
+            ocr_box_input,
+            ocr_color_dropdown,
+            image_input,
+            image_editor,
+            submit_button,
+            editor_submit_button,
+            gallery_input,
+            gallery_submit_button,
+        ],
+    )
+    task_dropdown.change(
+        update_image_input,
+        inputs=[task_dropdown],
+        outputs=[
+            image_input,
+            image_editor,
+            editor_submit_button,
+            gallery_input,
+            gallery_submit_button,
+        ],
+    )
+    submit_button.click(
+        ocr_demo,
+        inputs=[
+            image_input,
+            task_dropdown,
+            ocr_type_dropdown,
+            ocr_box_input,
+            ocr_color_dropdown,
+        ],
+        outputs=[output_markdown, output_html],
+    )
+    editor_submit_button.click(
+        ocr_demo,
+        inputs=[
+            image_editor,
+            task_dropdown,
+            ocr_type_dropdown,
+            ocr_box_input,
+            ocr_color_dropdown,
+        ],
+        outputs=[output_markdown, output_html],
+    )
+    gallery_submit_button.click(
+        ocr_demo,
+        inputs=[
+            gallery_input,
+            task_dropdown,
+            ocr_type_dropdown,
+            ocr_box_input,
+            ocr_color_dropdown,
+        ],
+        outputs=[output_markdown, output_html],
+    )
+    example = gr.Examples(
+        examples=[
+            [
+                "./sheet_music.png",
+                "Format Text OCR",
+                "format",
+                None,
+                None,
+            ],
+            [
+                "./latex.png",
+                "Format Text OCR",
+                "format",
+                None,
+                None,
+            ],
+        ],
+        inputs=[
+            image_input,
+            task_dropdown,
+            ocr_type_dropdown,
+            ocr_box_input,
+            ocr_color_dropdown,
+        ],
+        outputs=[output_markdown, output_html],
+    )
+    example_finegrained = gr.Examples(
+        examples=[
+            [
+                "./multi_box.png",
+                "Fine-grained OCR (Color)",
+                "ocr",
+                None,
+                "red",
+            ]
+        ],
+        inputs=[
+            image_editor,
+            task_dropdown,
+            ocr_type_dropdown,
+            ocr_box_input,
+            ocr_color_dropdown,
+        ],
+        outputs=[output_markdown, output_html],
+        label="Fine-grained example",
+    )
+    gr.Markdown(
+        "Space based on [Tonic's GOT-OCR](https://huggingface.co/spaces/Tonic/GOT-OCR)"
+    )
+if __name__ == "__main__":
+    cleanup_old_files()
+    demo.launch()

globe.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# Markdown heading shown at the top of the demo page (app.py imports `title`).
+title = """# GOT-OCR 2.0: Transformers 🤗 implementation demo"""
+# Markdown body shown under the heading (app.py imports `description`).
+description = """
+This demo utilizes the **Transformers implementation of GOT-OCR 2.0** to extract text from images.
+The GOT-OCR 2.0 model was introduced in the paper:
+[**General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model**](https://arxiv.org/abs/2409.01704)
+by *Haoran Wei, Chenglong Liu, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Liang Zhao, Jianjian Sun, Yuang Peng, Chunrui Han, and Xiangyu Zhang*.
+### Key Features
+GOT-OCR 2.0 is a **state-of-the-art OCR model** designed to handle a wide variety of tasks, including:
+- **Plain Text OCR**
+- **Formatted Text OCR**
+- **Fine-grained OCR**
+- **Multi-crop OCR**
+- **Multi-page OCR**
+### Beyond Text
+GOT-OCR 2.0 has also been fine-tuned to work with non-textual data, such as:
+- **Charts and Tables**
+- **Math and Molecular Formulas**
+- **Geometric Shapes**
+- **Sheet Music**
+Explore the capabilities of this cutting-edge model through this interactive demo!
+"""
+# Option lists mirroring the dropdown choices in app.py.
+# NOTE(review): app.py imports only `title` and `description` and spells these
+# lists out again inline, so the two copies can drift apart.
+tasks = [
+    "Plain Text OCR",
+    "Format Text OCR",
+    "Fine-grained OCR (Box)",
+    "Fine-grained OCR (Color)",
+    "Multi-crop OCR",
+    "Multi-page OCR",
+]
+ocr_types = ["ocr", "format"]
+ocr_colors = ["red", "green", "blue"]

latex.png ADDED Viewed

Git LFS Details

SHA256: 47f3e4388a5efcb36da513213497adcaedf15e4a769557d6e0dac768ee961f78
Pointer size: 131 Bytes
Size of remote file: 435 kB

multi_box.png ADDED Viewed

Git LFS Details

SHA256: 841238eccecfae8e7c21b196326e519b57f681d35619d773c25b8643aaa823a1
Pointer size: 131 Bytes
Size of remote file: 697 kB

render.py ADDED Viewed

	@@ -0,0 +1,119 @@

+# Full-width comma and full stop mapped to their ASCII counterparts; applied
+# to TikZ output in render_ocr_text through translation_table.
+punctuation_dict = {
+    "，": ",",
+    "。": ".",
+}
+translation_table = str.maketrans(punctuation_dict)
+# Marker stripped from the end of the OCR text before rendering.
+stop_str = "<|im_end|>"
+def svg_to_html(svg_content, output_filename):
+    html_content = f"""
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>SVG Embedded in HTML</title>
+    </head>
+    <body>
+        <svg width="2100" height="15000" xmlns="http://www.w3.org/2000/svg">
+            {svg_content}
+        </svg>
+    </body>
+    </html>
+    """
+    with open(output_filename, "w") as file:
+        file.write(html_content)
+def render_ocr_text(text, result_path, format_text=False):
+    if text.endswith(stop_str):
+        text = text[: -len(stop_str)]
+    text = text.strip()
+    if "**kern" in text:
+        import verovio
+        tk = verovio.toolkit()
+        tk.loadData(text)
+        tk.setOptions(
+            {
+                "pageWidth": 2100,
+                "footer": "none",
+                "barLineWidth": 0.5,
+                "beamMaxSlope": 15,
+                "staffLineWidth": 0.2,
+                "spacingStaff": 6,
+            }
+        )
+        tk.getPageCount()
+        svg = tk.renderToSVG()
+        svg = svg.replace('overflow="inherit"', 'overflow="visible"')
+        svg_to_html(svg, result_path)
+    if format_text and "**kern" not in text:
+        if "\\begin{tikzpicture}" not in text:
+            html_path = "./render_tools/" + "/content-mmd-to-html.html"
+            right_num = text.count("\\right")
+            left_num = text.count("\left")
+            if right_num != left_num:
+                text = (
+                    text.replace("\left(", "(")
+                    .replace("\\right)", ")")
+                    .replace("\left[", "[")
+                    .replace("\\right]", "]")
+                    .replace("\left{", "{")
+                    .replace("\\right}", "}")
+                    .replace("\left|", "|")
+                    .replace("\\right|", "|")
+                    .replace("\left.", ".")
+                    .replace("\\right.", ".")
+                )
+            text = text.replace('"', "``").replace("$", "")
+            outputs_list = text.split("\n")
+            gt = ""
+            for out in outputs_list:
+                gt += '"' + out.replace("\\", "\\\\") + r"\n" + '"' + "+" + "\n"
+            gt = gt[:-2]
+            with open(html_path, "r") as web_f:
+                lines = web_f.read()
+                lines = lines.split("const text =")
+                new_web = lines[0] + "const text =" + gt + lines[1]
+        else:
+            html_path = "./render_tools/" + "/tikz.html"
+            text = text.translate(translation_table)
+            outputs_list = text.split("\n")
+            gt = ""
+            for out in outputs_list:
+                if out:
+                    if (
+                        "\\begin{tikzpicture}" not in out
+                        and "\\end{tikzpicture}" not in out
+                    ):
+                        while out[-1] == " ":
+                            out = out[:-1]
+                            if out is None:
+                                break
+                        if out:
+                            if out[-1] != ";":
+                                gt += out[:-1] + ";\n"
+                            else:
+                                gt += out + "\n"
+                    else:
+                        gt += out + "\n"
+            with open(html_path, "r") as web_f:
+                lines = web_f.read()
+                lines = lines.split("const text =")
+                new_web = lines[0] + gt + lines[1]
+        with open(result_path, "w") as web_f_new:
+            web_f_new.write(new_web)

render_tools/content-mmd-to-html.html ADDED Viewed

	@@ -0,0 +1,39 @@

+<!DOCTYPE html>
+<html lang="en" data-lt-installed="true"><head>
+  <meta charset="UTF-8">
+  <title>Title</title>
+  <!-- Template: render.py splices the OCR output, as a JavaScript string
+       expression, right after the assignment in the next script block. -->
+  <script>
+    const text =
+  </script>
+  <style>
+    #content {
+      max-width: 800px;
+      margin: auto;
+    }
+  </style>
+  <script>
+    let script = document.createElement('script');
+    script.src = "https://cdn.jsdelivr.net/npm/mathpix-markdown-it@1.3.6/es5/bundle.js";
+    document.head.append(script);
+    script.onload = function() {
+      const isLoaded = window.loadMathJax();
+      if (isLoaded) {
+        console.log('Styles loaded!')
+      }
+      const el = window.document.getElementById('content-text');
+      if (el) {
+        const options = {
+          htmlTags: true
+        };
+        const html = window.render(text, options);
+        el.outerHTML = html;
+      }
+    };
+  </script>
+</head>
+<body>
+  <div id="content"><div id="content-text"></div></div>
+</body>
+</html>

render_tools/tikz.html ADDED Viewed

	@@ -0,0 +1,17 @@

+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Document</title>
+<link rel="stylesheet" type="text/css" href="https://tikzjax.com/v1/fonts.css">
+<script src="https://tikzjax.com/v1/tikzjax.js"></script>
+</head>
+<body>
+<!-- Template: render.py replaces the placeholder line inside the text/tikz
+     script below with the TikZ source produced by the OCR model. -->
+<script type="text/tikz">
+const text =
+</script>
+</body>
+</html>

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+# NOTE(review): app.py also imports gradio and spaces, which are not listed
+# here - presumably supplied by the hosting runtime; confirm.
+torch==2.5.1
+torchvision==0.20.1
+# transformers installed from the add-got-ocr2 branch of a fork.
+git+https://github.com/yonigozlan/transformers.git@add-got-ocr2
+# Imported by render.py when the OCR text contains **kern.
+verovio
+# Provides the cv2 module imported by app.py.
+opencv-python
+numpy==1.26.3
+pillow

sheet_music.png ADDED Viewed

Git LFS Details

SHA256: 2b4d14e87b3c854e0a665b5c48e5ea9aefb03b7d89262eff668abe9d113637c0
Pointer size: 131 Bytes
Size of remote file: 735 kB