yonigozlan's picture
yonigozlan HF staff
change buttons style
f67a5c0 verified
import base64
import os
import re
import shutil
import time
import uuid
from pathlib import Path
import cv2
import gradio as gr
import numpy as np
import spaces
import torch
from globe import description, title
from PIL import Image
from render import render_ocr_text
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.image_utils import load_image
model_name = "yonigozlan/GOT-OCR-2.0-hf"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForImageTextToText.from_pretrained(
model_name, low_cpu_mem_usage=True, device_map=device
)
model = model.eval().to(device)
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"
stop_str = "<|im_end|>"
for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
if not os.path.exists(folder):
os.makedirs(folder)
input_index = 0
@spaces.GPU()
def process_image(image, task, ocr_type=None, ocr_box=None, ocr_color=None):
if image is None:
return "Error: No image provided", None, None
unique_id = str(uuid.uuid4())
image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.png")
result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}.html")
try:
if not isinstance(image, (tuple, list)):
image = [image]
else:
image = [img[0] for img in image]
for i, img in enumerate(image):
if isinstance(img, dict):
composite_image = img.get("composite")
if composite_image is not None:
if isinstance(composite_image, np.ndarray):
cv2.imwrite(
image_path, cv2.cvtColor(composite_image, cv2.COLOR_RGB2BGR)
)
elif isinstance(composite_image, Image.Image):
composite_image.save(image_path)
else:
return (
"Error: Unsupported image format from ImageEditor",
None,
None,
)
else:
return (
"Error: No composite image found in ImageEditor output",
None,
None,
)
elif isinstance(img, np.ndarray):
cv2.imwrite(image_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
elif isinstance(img, str):
shutil.copy(img, image_path)
else:
return "Error: Unsupported image format", None, None
image[i] = load_image(image_path)
if task == "Plain Text OCR":
inputs = processor(image, return_tensors="pt").to("cuda")
generate_ids = model.generate(
**inputs,
do_sample=False,
tokenizer=processor.tokenizer,
stop_strings=stop_str,
max_new_tokens=4096,
)
res = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
return res, None, unique_id
else:
if task == "Format Text OCR":
inputs = processor(image, return_tensors="pt", format=True).to("cuda")
generate_ids = model.generate(
**inputs,
do_sample=False,
tokenizer=processor.tokenizer,
stop_strings=stop_str,
max_new_tokens=4096,
)
res = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
ocr_type = "format"
elif task == "Fine-grained OCR (Box)":
inputs = processor(image, return_tensors="pt", box=ocr_box).to("cuda")
generate_ids = model.generate(
**inputs,
do_sample=False,
tokenizer=processor.tokenizer,
stop_strings=stop_str,
max_new_tokens=4096,
)
res = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
elif task == "Fine-grained OCR (Color)":
inputs = processor(image, return_tensors="pt", color=ocr_color).to(
"cuda"
)
generate_ids = model.generate(
**inputs,
do_sample=False,
tokenizer=processor.tokenizer,
stop_strings=stop_str,
max_new_tokens=4096,
)
res = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
elif task == "Multi-crop OCR":
inputs = processor(
image,
return_tensors="pt",
format=True,
crop_to_patches=True,
max_patches=5,
).to("cuda")
generate_ids = model.generate(
**inputs,
do_sample=False,
tokenizer=processor.tokenizer,
stop_strings=stop_str,
max_new_tokens=4096,
)
res = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
ocr_type = "format"
elif task == "Multi-page OCR":
inputs = processor(
image, return_tensors="pt", multi_page=True, format=True
).to("cuda")
generate_ids = model.generate(
**inputs,
do_sample=False,
tokenizer=processor.tokenizer,
stop_strings=stop_str,
max_new_tokens=4096,
)
res = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
ocr_type = "format"
render_ocr_text(res, result_path, format_text=ocr_type == "format")
if os.path.exists(result_path):
with open(result_path, "r") as f:
html_content = f.read()
return res, html_content, unique_id
else:
return res, None, unique_id
except Exception as e:
return f"Error: {str(e)}", None, None
finally:
if os.path.exists(image_path):
os.remove(image_path)
def update_image_input(task):
if task == "Fine-grained OCR (Color)":
return (
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=False),
)
elif task == "Multi-page OCR":
return (
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=True),
)
else:
return (
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
)
def update_inputs(task):
if task in [
"Plain Text OCR",
"Format Text OCR",
"Multi-crop OCR",
]:
return [
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
]
elif task == "Fine-grained OCR (Box)":
return [
gr.update(visible=True, choices=["ocr", "format"]),
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
]
elif task == "Fine-grained OCR (Color)":
return [
gr.update(visible=True, choices=["ocr", "format"]),
gr.update(visible=False),
gr.update(visible=True, choices=["red", "green", "blue"]),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=False),
]
elif task == "Multi-page OCR":
return [
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=True),
]
def parse_latex_output(res):
# Split the input, preserving newlines and empty lines
lines = re.split(r"(\$\$.*?\$\$)", res, flags=re.DOTALL)
parsed_lines = []
in_latex = False
latex_buffer = []
for line in lines:
if line == "\n":
if in_latex:
latex_buffer.append(line)
else:
parsed_lines.append(line)
continue
line = line.strip()
latex_patterns = [r"\{", r"\}", r"\[", r"\]", r"\\", r"\$", r"_", r"^", r'"']
contains_latex = any(re.search(pattern, line) for pattern in latex_patterns)
if contains_latex:
if not in_latex:
in_latex = True
latex_buffer = ["$$"]
latex_buffer.append(line)
else:
if in_latex:
latex_buffer.append("$$")
parsed_lines.extend(latex_buffer)
in_latex = False
latex_buffer = []
parsed_lines.append(line)
if in_latex:
latex_buffer.append("$$")
parsed_lines.extend(latex_buffer)
return "$$\\$$\n".join(parsed_lines)
def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
res, html_content, unique_id = process_image(
image, task, ocr_type, ocr_box, ocr_color
)
if isinstance(res, str) and res.startswith("Error:"):
return res, None
res = res.replace("\\title", "\\title ")
formatted_res = res
# formatted_res = parse_latex_output(res)
if html_content:
encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
iframe_src = f"data:text/html;base64,{encoded_html}"
iframe = f'<iframe src="{iframe_src}" width="100%" height="600px"></iframe>'
download_link = f'<a href="data:text/html;base64,{encoded_html}" download="result_{unique_id}.html">Download Full Result</a>'
return formatted_res, f"{download_link}<br>{iframe}"
return formatted_res, None
def cleanup_old_files():
current_time = time.time()
for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
for file_path in Path(folder).glob("*"):
if current_time - file_path.stat().st_mtime > 3600: # 1 hour
file_path.unlink()
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown(title)
gr.Markdown(description)
with gr.Row():
with gr.Column(scale=1):
with gr.Group():
image_input = gr.Image(type="filepath", label="Input Image")
gallery_input = gr.Gallery(
type="filepath", label="Input images", visible=False
)
image_editor = gr.ImageEditor(
label="Image Editor", type="pil", visible=False
)
task_dropdown = gr.Dropdown(
choices=[
"Plain Text OCR",
"Format Text OCR",
"Fine-grained OCR (Box)",
"Fine-grained OCR (Color)",
"Multi-crop OCR",
"Multi-page OCR",
],
label="Select Task",
value="Plain Text OCR",
)
ocr_type_dropdown = gr.Dropdown(
choices=["ocr", "format"], label="OCR Type", visible=False
)
ocr_box_input = gr.Textbox(
label="OCR Box (x1,y1,x2,y2)",
placeholder="[100,100,200,200]",
visible=False,
)
ocr_color_dropdown = gr.Dropdown(
choices=["red", "green", "blue"], label="OCR Color", visible=False
)
# with gr.Row():
# max_new_tokens_slider = gr.Slider(50, 500, step=10, value=150, label="Max New Tokens")
# no_repeat_ngram_size_slider = gr.Slider(1, 10, step=1, value=2, label="No Repeat N-gram Size")
submit_button = gr.Button("Process", variant="primary")
editor_submit_button = gr.Button("Process Edited Image", visible=False, variant="primary")
gallery_submit_button = gr.Button(
"Process Multiple Images", visible=False, variant="primary"
)
with gr.Column(scale=1):
with gr.Group():
output_markdown = gr.Textbox(label="Text output")
output_html = gr.HTML(label="HTML output")
input_types = [
image_input,
image_editor,
gallery_input,
]
task_dropdown.change(
update_inputs,
inputs=[task_dropdown],
outputs=[
ocr_type_dropdown,
ocr_box_input,
ocr_color_dropdown,
image_input,
image_editor,
submit_button,
editor_submit_button,
gallery_input,
gallery_submit_button,
],
)
task_dropdown.change(
update_image_input,
inputs=[task_dropdown],
outputs=[
image_input,
image_editor,
editor_submit_button,
gallery_input,
gallery_submit_button,
],
)
submit_button.click(
ocr_demo,
inputs=[
image_input,
task_dropdown,
ocr_type_dropdown,
ocr_box_input,
ocr_color_dropdown,
],
outputs=[output_markdown, output_html],
)
editor_submit_button.click(
ocr_demo,
inputs=[
image_editor,
task_dropdown,
ocr_type_dropdown,
ocr_box_input,
ocr_color_dropdown,
],
outputs=[output_markdown, output_html],
)
gallery_submit_button.click(
ocr_demo,
inputs=[
gallery_input,
task_dropdown,
ocr_type_dropdown,
ocr_box_input,
ocr_color_dropdown,
],
outputs=[output_markdown, output_html],
)
example = gr.Examples(
examples=[
[
"./sheet_music.png",
"Format Text OCR",
"format",
None,
None,
],
[
"./latex.png",
"Format Text OCR",
"format",
None,
None,
],
],
inputs=[
image_input,
task_dropdown,
ocr_type_dropdown,
ocr_box_input,
ocr_color_dropdown,
],
outputs=[output_markdown, output_html],
)
example_finegrained = gr.Examples(
examples=[
[
"./multi_box.png",
"Fine-grained OCR (Color)",
"ocr",
None,
"red",
]
],
inputs=[
image_editor,
task_dropdown,
ocr_type_dropdown,
ocr_box_input,
ocr_color_dropdown,
],
outputs=[output_markdown, output_html],
label="Fine-grained example",
)
gr.Markdown(
"Space based on [Tonic's GOT-OCR](https://huggingface.co./spaces/Tonic/GOT-OCR)"
)
if __name__ == "__main__":
cleanup_old_files()
demo.launch()