# Reference: https://huggingface.co/spaces/haotiz/glip-zeroshot-demo/blob/main/app.py

import requests
import os
from io import BytesIO
from PIL import Image
import numpy as np
from pathlib import Path
import gradio as gr

import warnings

# Install a blanket "ignore" filter for Python warnings. It is placed ahead of
# the maskrcnn_benchmark imports below so the filter is already in effect
# while those modules are imported.
warnings.filterwarnings("ignore")

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo

# ---------------------------------------------------------------------------
# DesCo-GLIP model
# ---------------------------------------------------------------------------
config_file, weight_file = (
    "configs/pretrain_new/desco_glip.yaml",
    "MODEL/desco_glip_tiny.pth",
)

# Order matters: the manual overrides are set on the imported cfg first, the
# YAML file is merged next, and the checkpoint path and device are applied
# last as (key, value) pairs.
cfg.local_rank, cfg.num_gpus = 0, 1
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file, "MODEL.DEVICE", "cuda"])

# Build the GLIP-based demo predictor from the fully merged config.
glip_demo = GLIPDemo(cfg, min_image_size=800, confidence_threshold=0.7, show_mask_heatmaps=False)

# ---------------------------------------------------------------------------
# DesCo-FIBER model
# ---------------------------------------------------------------------------
# Imported at module level, so the name is visible to code further down the file.
from copy import deepcopy

config_file, weight_file = (
    "configs/pretrain_new/desco_fiber.yaml",
    "MODEL/desco_fiber_base.pth",
)

# Rebind `cfg` to a deep copy so the FIBER settings are merged into a separate
# object. The copy starts from the current state of `cfg`, so any value already
# set on it stays in place unless the FIBER YAML or the pairs below override it.
cfg = deepcopy(cfg)
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file, "MODEL.DEVICE", "cuda"])

# Build the FIBER-based demo predictor with the same constructor options.
fiber_demo = GLIPDemo(cfg, min_image_size=800, confidence_threshold=0.7, show_mask_heatmaps=False)

# Drawing options for the rendered detections, kept in one mapping so the same
# settings can be unpacked as keyword arguments wherever they are needed.
athetics_params = dict(
    skip_name=False,  # whether we overlay the phrase over the box
    override_color=(0, 90, 190),
    text_size=1.0,
    text_pixel=3,
    box_alpha=1.0,
    box_pixel=5,
    text_offset_original=8,  # distance between text and box
)

def predict(image, text, ground_tokens=""):
    """Run both DesCo models on one image and return their rendered outputs.

    Args:
        image: Image array indexed as ``image[:, :, channel]``. The channel
            axis is reversed before inference (presumably RGB from Gradio to
            the BGR order the predictors expect -- TODO confirm).
        text: Caption / description handed to both models unchanged.
        ground_tokens: Optional ``;``-separated phrases. Each phrase is
            stripped of surrounding whitespace and empty entries are dropped.
            A blank string or ``None`` results in ``None`` being passed on.

    Returns:
        A ``(glip_image, fiber_image)`` tuple; each result has its channel
        axis reversed back before being returned.
    """
    # Normalise the token list: tolerate None, "a; b" and trailing ";" so the
    # models never receive padded or empty phrases.
    tokens = [tok.strip() for tok in (ground_tokens or "").split(";")]
    tokens = [tok for tok in tokens if tok]
    ground_tokens = tokens or None

    channel_swap = [2, 1, 0]
    # Indexing a NumPy array with a list builds a new array, so each model
    # gets its own private copy of the input without an extra deepcopy.
    # NOTE(review): 0.5 is the third positional argument of run_on_web_image,
    # presumably a score threshold -- confirm against GLIPDemo.
    result, _ = glip_demo.run_on_web_image(
        image[:, :, channel_swap], text, 0.5, ground_tokens, **athetics_params
    )
    fiber_result, _ = fiber_demo.run_on_web_image(
        image[:, :, channel_swap], text, 0.5, ground_tokens, **athetics_params
    )
    return result[:, :, channel_swap], fiber_result[:, :, channel_swap]


# NOTE(review): this component is created but never passed to the Interface
# below, which uses the "image" shortcut string instead. Kept so the
# module-level name stays available.
image = gr.inputs.Image()


# Build and immediately launch the web UI. The three inputs map, in order, to
# predict(image, text, ground_tokens); the two outputs are the images returned
# for the GLIP-based and FIBER-based models.
# NOTE(review): gr.inputs / gr.outputs look like legacy Gradio namespaces --
# confirm the pinned Gradio version before upgrading.
gr.Interface(
    description="Object Recognition with DesCo (https://github.com/liunian-harold-li/DesCo)",
    fn=predict,
    inputs=["image", "text", "text"],
    outputs=[
        gr.outputs.Image(
            type="pil",
            label="DesCo-GLIP"
        ),
        gr.outputs.Image(
            type="pil",
            label="DesCo-FIBER"
        ),
    ],
    # Each example row is [image path, caption, ground token(s)].
    examples=[
        ["./1.jpg", "A clown making a balloon animal for a pretty lady.", "clown"],
        ["./1.jpg", "A clown kicking a soccer ball for a pretty lady.", "clown"],
        ["./2.jpg", "A kind of tool, wooden handle with a round head.", "tool"],
        ["./3.jpg", "Bumblebee, yellow with black accents.", "Bumblebee"],
    ],
    # Read the article as UTF-8 explicitly so the result does not depend on
    # the platform's default locale encoding.
    article=Path("docs/intro.md").read_text(encoding="utf-8")
).launch()