Glyph-SDXL-v2

Paused

File size: 56,267 Bytes

import gc
import json
import webcolors
import spaces
import gradio as gr
import os.path as osp
from copy import deepcopy
from PIL import Image, ImageDraw, ImageFont

import torch
from diffusers import UNet2DConditionModel, AutoencoderKL
from diffusers.models.attention import BasicTransformerBlock
from peft import LoraConfig
from peft.utils import set_peft_model_state_dict
from transformers import PretrainedConfig

from diffusers import DPMSolverMultistepScheduler

from glyph_sdxl.utils import (
    parse_config,
    UNET_CKPT_NAME,
    huggingface_cache_dir,
    load_byt5_and_byt5_tokenizer,
    BYT5_MAPPER_CKPT_NAME,
    INSERTED_ATTN_CKPT_NAME,
    BYT5_CKPT_NAME,
    PromptFormat,
    MultilingualPromptFormat,
)
from glyph_sdxl.custom_diffusers import (
    StableDiffusionGlyphXLPipeline,
    CrossAttnInsertBasicTransformerBlock,
)
from glyph_sdxl.modules import T5EncoderBlockByT5Mapper
from demo.constants import MAX_TEXT_BOX


# Click state for the English canvas (see get_pixels):
#   0 -> the next click starts a new box, 1 -> the next click completes the box
#   whose first corner is the last entry of `stack`.
state = 0
# Entries are either [x, y] (first corner only) or [x0, y0, x1, y1] (both corners).
stack = []
# Same state/stack pair for the multilingual canvas (see get_pixels_multilingual).
multilingual_state = 0
multilingual_stack = []
# Font used to draw the box numbers on the sketch canvas.
font = ImageFont.truetype("assets/Arial.ttf", 20)

device = "cuda"
# Both pipelines stay None until init_pipeline() assigns them.
pipeline = None
pipeline_multilingual = None
prompt_format = PromptFormat()
multilingual_prompt_format = MultilingualPromptFormat()

# Language code -> display name. The codes also name the per-language JSON
# files loaded below.
multilingual_code_dict = {
    'cn': 'Chinese',
    'en': 'English',
    'fr': 'French',
    'de': 'German',
    'es': 'Spanish',
    'it': 'Italian',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'jp': 'Japanese',
    'kr': 'Korean',
}
# Display name -> code used as the font-family prefix in
# generate_image_multilingual.
# NOTE(review): this is not the inverse of multilingual_code_dict -- French,
# German, Spanish, Italian, Portuguese and Russian all map to 'en'. Presumably
# these languages share the English font set; confirm before "fixing".
multilingual_reverse_code_dict = {
    'Chinese': 'cn',
    'English': 'en',
    'French': 'en',
    'German': 'en',
    'Spanish': 'en',
    'Italian': 'en',
    'Portuguese': 'en',
    'Russian': 'en',
    'Japanese': 'jp',
    'Korean': 'kr',
}
# Language code -> parsed content of assets/multi_fonts/<code>.json
# (presumably the list of fonts available for that language -- TODO confirm).
multilingual_font_dict = {}
multilingual_meta_path = 'assets/multi_fonts'

# Runs at import time: one JSON file per language code must exist.
for code in multilingual_code_dict:
    with open(osp.join(multilingual_meta_path, f"{code}.json"), 'r') as f:
        lang_font_list = json.load(f)
    multilingual_font_dict[code] = lang_font_list


def flush():
    """Run the garbage collector, then ask PyTorch to empty its CUDA cache.

    The collection happens first so that tensors which just became
    unreachable are gone before the cache is emptied.
    """
    gc.collect()
    torch.cuda.empty_cache()

def import_model_class_from_model_name_or_path(
    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder",
):
    """Return the text-encoder class named by a checkpoint's config.

    The config stored under ``subfolder`` is loaded and the first entry of
    its ``architectures`` list selects the class.

    Args:
        pretrained_model_name_or_path: Passed to ``PretrainedConfig.from_pretrained``.
        revision: Passed to ``PretrainedConfig.from_pretrained``.
        subfolder: Sub-directory holding the text-encoder config.

    Returns:
        ``CLIPTextModel`` or ``CLIPTextModelWithProjection`` (the class, not
        an instance).

    Raises:
        ValueError: If the architecture is neither of the two classes above.
    """
    encoder_config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path,
        subfolder=subfolder,
        revision=revision,
    )
    architecture = encoder_config.architectures[0]

    if architecture not in ("CLIPTextModel", "CLIPTextModelWithProjection"):
        raise ValueError(f"{architecture} is not supported.")

    # The classes are imported lazily, only once the architecture is known.
    if architecture == "CLIPTextModel":
        from transformers import CLIPTextModel

        return CLIPTextModel

    from transformers import CLIPTextModelWithProjection

    return CLIPTextModelWithProjection

def _insert_glyph_cross_attn_blocks(unet, block_names, cross_attn_dim, origin_dtype):
    """Swap selected transformer blocks of ``unet`` for cross-attn-insert blocks.

    Every ``BasicTransformerBlock`` whose dotted module name is listed in
    ``block_names`` is rebuilt with
    ``CrossAttnInsertBasicTransformerBlock.from_transformer_block`` and
    registered on its parent under the same name. ``unet`` is modified in place.

    Args:
        unet: Model whose ``named_modules()`` are scanned.
        block_names: Dotted names of the blocks to replace.
        cross_attn_dim: Second argument forwarded to ``from_transformer_block``.
        origin_dtype: dtype the block's original sub-modules are cast to.
    """
    seen_param_keys = set()
    for name, module in unet.named_modules():
        if not isinstance(module, BasicTransformerBlock) or name not in block_names:
            continue

        # Walk down to the module that owns the block so it can be re-registered.
        *parent_path, leaf_name = name.split(".")
        parent_module = unet
        for attr in parent_path:
            parent_module = getattr(parent_module, attr)

        new_block = CrossAttnInsertBasicTransformerBlock.from_transformer_block(
            module,
            cross_attn_dim,
        )
        # Freeze everything, then re-enable gradients on the inserted modules only.
        new_block.requires_grad_(False)
        for inserted_name, inserted_module in zip(
            new_block.get_inserted_modules_names(),
            new_block.get_inserted_modules(),
        ):
            inserted_module.requires_grad_(True)
            for para_name, _ in inserted_module.named_parameters():
                para_key = name + '.' + inserted_name + '.' + para_name
                # Internal invariant: every inserted parameter is seen exactly once.
                assert para_key not in seen_param_keys
                seen_param_keys.add(para_key)
        for origin_module in new_block.get_origin_modules():
            origin_module.to(dtype=origin_dtype)
        parent_module.register_module(leaf_name, new_block)
        print(f"inserted cross attn block to {name}")


def init_pipeline():
    """Build both Glyph-SDXL pipelines and store them in the module globals.

    Loads the two CLIP text encoders, the VAE, one UNet per variant (English
    and multilingual), the ByT5 encoders/tokenizers and mappers, patches the
    UNets with the inserted cross-attention blocks and LoRA adapters, loads
    the fine-tuned checkpoints from ``checkpoints/``, and finally moves
    everything to ``device``. The results are assigned to ``pipeline`` and
    ``pipeline_multilingual``.

    The text encoders, the VAE and the inference dtype come from the English
    config and are shared by both pipelines.

    Raises:
        AssertionError: If an inserted-attention checkpoint contains keys the
            patched UNet does not have.
    """

    global pipeline
    global pipeline_multilingual

    config = parse_config('configs/glyph_sdxl_albedo.py')
    ckpt_dir = 'checkpoints/glyph-sdxl'
    config_multilingual = parse_config('configs/glyph_sdxl_multilingual_albedo.py')
    ckpt_dir_multilingual = 'checkpoints/glyph-sdxl_multilingual_10-lang'

    # --- base models -------------------------------------------------------
    text_encoder_cls_one = import_model_class_from_model_name_or_path(
        config.pretrained_model_name_or_path, config.revision,
    )
    text_encoder_cls_two = import_model_class_from_model_name_or_path(
        config.pretrained_model_name_or_path, config.revision, subfolder="text_encoder_2",
    )
    text_encoder_one = text_encoder_cls_one.from_pretrained(
        config.pretrained_model_name_or_path, subfolder="text_encoder", revision=config.revision,
        cache_dir=huggingface_cache_dir,
    )
    text_encoder_two = text_encoder_cls_two.from_pretrained(
        config.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=config.revision,
        cache_dir=huggingface_cache_dir,
    )

    unet = UNet2DConditionModel.from_pretrained(
        config.pretrained_model_name_or_path,
        subfolder="unet",
        revision=config.revision,
        cache_dir=huggingface_cache_dir,
    )
    unet_multilingual = UNet2DConditionModel.from_pretrained(
        config_multilingual.pretrained_model_name_or_path,
        subfolder="unet",
        # Fixed: the multilingual UNet previously used the English config's revision.
        revision=config_multilingual.revision,
        cache_dir=huggingface_cache_dir,
    )

    # Use the base checkpoint's "vae" subfolder unless a dedicated VAE is configured.
    use_base_vae = config.pretrained_vae_model_name_or_path is None
    vae_path = (
        config.pretrained_model_name_or_path
        if use_base_vae
        else config.pretrained_vae_model_name_or_path
    )
    vae = AutoencoderKL.from_pretrained(
        vae_path, subfolder="vae" if use_base_vae else None,
        revision=config.revision,
        cache_dir=huggingface_cache_dir,
    )

    byt5_model, byt5_tokenizer = load_byt5_and_byt5_tokenizer(
        **config.byt5_config,
        huggingface_cache_dir=huggingface_cache_dir,
    )
    byt5_model_multilingual, byt5_tokenizer_multilingual = load_byt5_and_byt5_tokenizer(
        **config_multilingual.byt5_config,
        huggingface_cache_dir=huggingface_cache_dir,
    )

    inference_dtype = torch.float32
    if config.inference_dtype == "fp16":
        inference_dtype = torch.float16
    elif config.inference_dtype == "bf16":
        inference_dtype = torch.bfloat16

    # --- patch the UNets with the inserted cross-attention blocks ----------
    sdxl_channels = config.byt5_mapper_config.sdxl_channels
    _insert_glyph_cross_attn_blocks(
        unet,
        config.attn_block_to_modify,
        byt5_model.config.d_model if sdxl_channels is None else sdxl_channels,
        inference_dtype,
    )
    sdxl_channels_multilingual = config_multilingual.byt5_mapper_config.sdxl_channels
    _insert_glyph_cross_attn_blocks(
        unet_multilingual,
        config_multilingual.attn_block_to_modify,
        # Fixed: fall back to the multilingual ByT5 width, not the English one.
        (
            byt5_model_multilingual.config.d_model
            if sdxl_channels_multilingual is None
            else sdxl_channels_multilingual
        ),
        inference_dtype,
    )

    # --- ByT5 mappers ------------------------------------------------------
    byt5_mapper_dict = [T5EncoderBlockByT5Mapper]
    byt5_mapper_dict = {mapper.__name__: mapper for mapper in byt5_mapper_dict}
    byt5_mapper = byt5_mapper_dict[config.byt5_mapper_type](
        byt5_model.config,
        **config.byt5_mapper_config,
    )
    byt5_mapper_multilingual = byt5_mapper_dict[config_multilingual.byt5_mapper_type](
        # Fixed: the multilingual mapper is built from the multilingual ByT5 config.
        byt5_model_multilingual.config,
        **config_multilingual.byt5_mapper_config,
    )

    # --- LoRA adapters -----------------------------------------------------
    unet_lora_target_modules = [
        "attn1.to_k", "attn1.to_q", "attn1.to_v", "attn1.to_out.0",
        "attn2.to_k", "attn2.to_q", "attn2.to_v", "attn2.to_out.0",
    ]
    unet_lora_config = LoraConfig(
        r=config.unet_lora_rank,
        lora_alpha=config.unet_lora_rank,
        init_lora_weights="gaussian",
        target_modules=unet_lora_target_modules,
    )
    unet.add_adapter(unet_lora_config)
    unet_lora_config_multilingual = LoraConfig(
        r=config_multilingual.unet_lora_rank,
        lora_alpha=config_multilingual.unet_lora_rank,
        init_lora_weights="gaussian",
        target_modules=unet_lora_target_modules,
    )
    unet_multilingual.add_adapter(unet_lora_config_multilingual)

    # --- fine-tuned checkpoints (loaded on CPU first) ----------------------
    unet_lora_layers_para = torch.load(osp.join(ckpt_dir, UNET_CKPT_NAME), map_location='cpu')
    incompatible_keys = set_peft_model_state_dict(unet, unet_lora_layers_para, adapter_name="default")
    if getattr(incompatible_keys, 'unexpected_keys', []) == []:
        print("loaded unet_lora_layers_para")
    else:
        print(f"unet_lora_layers has unexpected_keys: {getattr(incompatible_keys, 'unexpected_keys', None)}")
    unet_lora_layers_para_multilingual = torch.load(osp.join(ckpt_dir_multilingual, UNET_CKPT_NAME), map_location='cpu')
    incompatible_keys = set_peft_model_state_dict(unet_multilingual, unet_lora_layers_para_multilingual, adapter_name="default")
    if getattr(incompatible_keys, 'unexpected_keys', []) == []:
        print("loaded unet_lora_layers_para_multilingual")
    else:
        print(f"unet_lora_layers_multilingual has unexpected_keys: {getattr(incompatible_keys, 'unexpected_keys', None)}")

    # strict=False: the checkpoint only holds the inserted modules, so missing
    # keys are expected; unexpected keys are not.
    inserted_attn_module_paras = torch.load(osp.join(ckpt_dir, INSERTED_ATTN_CKPT_NAME), map_location='cpu')
    _, unexpected_keys = unet.load_state_dict(inserted_attn_module_paras, strict=False)
    assert len(unexpected_keys) == 0, unexpected_keys
    inserted_attn_module_paras_multilingual = torch.load(osp.join(ckpt_dir_multilingual, INSERTED_ATTN_CKPT_NAME), map_location='cpu')
    _, unexpected_keys = unet_multilingual.load_state_dict(inserted_attn_module_paras_multilingual, strict=False)
    assert len(unexpected_keys) == 0, unexpected_keys

    byt5_mapper_para = torch.load(osp.join(ckpt_dir, BYT5_MAPPER_CKPT_NAME), map_location='cpu')
    byt5_mapper.load_state_dict(byt5_mapper_para)
    byt5_mapper_para_multilingual = torch.load(osp.join(ckpt_dir_multilingual, BYT5_MAPPER_CKPT_NAME), map_location='cpu')
    byt5_mapper_multilingual.load_state_dict(byt5_mapper_para_multilingual)

    byt5_model_para = torch.load(osp.join(ckpt_dir, BYT5_CKPT_NAME), map_location='cpu')
    byt5_model.load_state_dict(byt5_model_para)
    byt5_model_para_multilingual = torch.load(osp.join(ckpt_dir_multilingual, BYT5_CKPT_NAME), map_location='cpu')
    byt5_model_multilingual.load_state_dict(byt5_model_para_multilingual)

    # --- assemble the pipelines --------------------------------------------
    pipeline = StableDiffusionGlyphXLPipeline.from_pretrained(
        config.pretrained_model_name_or_path,
        vae=vae,
        text_encoder=text_encoder_one,
        text_encoder_2=text_encoder_two,
        byt5_text_encoder=byt5_model,
        byt5_tokenizer=byt5_tokenizer,
        byt5_mapper=byt5_mapper,
        unet=unet,
        byt5_max_length=config.byt5_max_length,
        revision=config.revision,
        torch_dtype=inference_dtype,
        safety_checker=None,
        cache_dir=huggingface_cache_dir,
    )

    pipeline.scheduler = DPMSolverMultistepScheduler.from_pretrained(
        config.pretrained_model_name_or_path,
        subfolder="scheduler",
        use_karras_sigmas=True,
    )

    pipeline_multilingual = StableDiffusionGlyphXLPipeline.from_pretrained(
        config_multilingual.pretrained_model_name_or_path,
        vae=vae,
        text_encoder=text_encoder_one,
        text_encoder_2=text_encoder_two,
        byt5_text_encoder=byt5_model_multilingual,
        byt5_tokenizer=byt5_tokenizer_multilingual,
        byt5_mapper=byt5_mapper_multilingual,
        unet=unet_multilingual,
        byt5_max_length=config_multilingual.byt5_max_length,
        revision=config_multilingual.revision,
        torch_dtype=inference_dtype,
        safety_checker=None,
        cache_dir=huggingface_cache_dir,
    )

    pipeline_multilingual.scheduler = DPMSolverMultistepScheduler.from_pretrained(
        config_multilingual.pretrained_model_name_or_path,
        subfolder="scheduler",
        use_karras_sigmas=True,
    )

    # --- move to gpu -------------------------------------------------------
    # The VAE taken from the base checkpoint is kept in float32; a separately
    # configured VAE is cast to the inference dtype.
    if use_base_vae:
        vae.to(device, dtype=torch.float32)
    else:
        vae.to(device, dtype=inference_dtype)
    text_encoder_one.to(device, dtype=inference_dtype)
    text_encoder_two.to(device, dtype=inference_dtype)
    byt5_mapper.to(device)
    byt5_model.to(device)
    unet.to(device, dtype=inference_dtype)
    pipeline = pipeline.to(device)

    byt5_mapper_multilingual.to(device)
    byt5_model_multilingual.to(device)
    unet_multilingual.to(device, dtype=inference_dtype)
    pipeline_multilingual = pipeline_multilingual.to(device)

def get_pixels(
    box_sketch_template,
    evt: gr.SelectData
):
    """Record a click on the English canvas and redraw every box.

    Clicks alternate between "first corner" and "second corner": the first
    one pushes a point onto ``stack``, the second one replaces that point
    with a four-element box. The incoming image is ignored; a fresh white
    1024x1024 canvas is drawn and returned.

    Args:
        box_sketch_template: Current canvas image (unused).
        evt: Gradio select event; ``evt.index`` is the clicked position.

    Returns:
        The redrawn canvas as a PIL image.
    """
    global state
    global stack

    click = evt.index

    if state != 0:
        # Second click: turn the pending point into a complete box.
        first_x, first_y = stack.pop()
        stack.append([first_x, first_y, click[0], click[1]])
        state = 0
    else:
        # First click: remember the corner and wait for the opposite one.
        stack.append(click)
        state = 1

    print(stack)

    canvas = Image.new('RGB', (1024, 1024), (255, 255, 255))
    painter = ImageDraw.Draw(canvas)
    red = (255, 0, 0)
    dot_radius = 4

    for number, entry in enumerate(stack, start=1):
        if len(entry) == 2:
            # Pending corner: numbered label plus a small red dot.
            px, py = entry
            painter.text((px + 2, py), str(number), font=font, fill=red)
            painter.ellipse(
                ((px - dot_radius, py - dot_radius), (px + dot_radius, py + dot_radius)),
                fill='red',
            )
        elif len(entry) == 4:
            # Complete box: sort the corners so the label sits at the top-left.
            left, right = min(entry[0], entry[2]), max(entry[0], entry[2])
            top, bottom = min(entry[1], entry[3]), max(entry[1], entry[3])
            painter.text((left + 2, top), str(number), font=font, fill=red)
            painter.rectangle((left, top, right, bottom), outline=(255, 0, 0))

    return canvas

def get_pixels_multilingual(
    box_sketch_template,
    evt: gr.SelectData
):
    """Record a click on the multilingual canvas and redraw every box.

    Works on ``multilingual_state`` / ``multilingual_stack``: the first click
    of a pair pushes a point, the second one replaces it with a four-element
    box. The incoming image is ignored; a fresh white 1024x1024 canvas is
    drawn and returned.

    Args:
        box_sketch_template: Current canvas image (unused).
        evt: Gradio select event; ``evt.index`` is the clicked position.

    Returns:
        The redrawn canvas as a PIL image.
    """
    global multilingual_state
    global multilingual_stack

    click = evt.index

    if multilingual_state != 0:
        # Second click: turn the pending point into a complete box.
        first_x, first_y = multilingual_stack.pop()
        multilingual_stack.append([first_x, first_y, click[0], click[1]])
        multilingual_state = 0
    else:
        # First click: remember the corner and wait for the opposite one.
        multilingual_stack.append(click)
        multilingual_state = 1

    print(multilingual_stack)

    canvas = Image.new('RGB', (1024, 1024), (255, 255, 255))
    painter = ImageDraw.Draw(canvas)
    red = (255, 0, 0)
    dot_radius = 4

    for number, entry in enumerate(multilingual_stack, start=1):
        if len(entry) == 2:
            # Pending corner: numbered label plus a small red dot.
            px, py = entry
            painter.text((px + 2, py), str(number), font=font, fill=red)
            painter.ellipse(
                ((px - dot_radius, py - dot_radius), (px + dot_radius, py + dot_radius)),
                fill='red',
            )
        elif len(entry) == 4:
            # Complete box: sort the corners so the label sits at the top-left.
            left, right = min(entry[0], entry[2]), max(entry[0], entry[2])
            top, bottom = min(entry[1], entry[3]), max(entry[1], entry[3])
            painter.text((left + 2, top), str(number), font=font, fill=red)
            painter.rectangle((left, top, right, bottom), outline=(255, 0, 0))

    return canvas

def exe_redo(
    box_sketch_template
):
    """Cancel the most recent click on the English canvas and redraw it.

    A pending first corner is removed; a complete box is reduced to its first
    corner so the second corner can be clicked again. With nothing on the
    stack the call is a no-op that returns a blank canvas. Previously it
    raised ``IndexError`` after already flipping ``state``, which made the
    next canvas click fail as well.

    Args:
        box_sketch_template: Current canvas image (unused).

    Returns:
        The redrawn white 1024x1024 canvas as a PIL image.
    """
    global state
    global stack

    if not stack:
        # Nothing to cancel: make sure the next click starts a new box.
        state = 0
    elif len(stack[-1]) == 2:
        # Drop the pending corner; the next click starts a new box.
        stack = stack[:-1]
        state = 0
    else:
        # Re-open the last box: keep its first corner, await the second again.
        x, y, _, _ = stack[-1]
        stack = stack[:-1] + [[x, y]]
        state = 1

    box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255))
    draw = ImageDraw.Draw(box_sketch_template)
    text_color = (255, 0, 0)
    r = 4

    for i, text_position in enumerate(stack):
        if len(text_position) == 2:
            x, y = text_position
            draw.text((x + 2, y), str(i + 1), font=font, fill=text_color)
            draw.ellipse(((x - r, y - r), (x + r, y + r)), fill='red')
        elif len(text_position) == 4:
            x0, y0, x1, y1 = text_position
            x0, x1 = min(x0, x1), max(x0, x1)
            y0, y1 = min(y0, y1), max(y0, y1)
            draw.text((x0 + 2, y0), str(i + 1), font=font, fill=text_color)
            draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0))

    return box_sketch_template

def exe_redo_multilingual(
    box_sketch_template
):
    """Cancel the most recent click on the multilingual canvas and redraw it.

    A pending first corner is removed; a complete box is reduced to its first
    corner so the second corner can be clicked again. With nothing on the
    stack the call is a no-op that returns a blank canvas. Previously it
    raised ``IndexError`` after already flipping ``multilingual_state``,
    which made the next canvas click fail as well.

    Args:
        box_sketch_template: Current canvas image (unused).

    Returns:
        The redrawn white 1024x1024 canvas as a PIL image.
    """
    global multilingual_state
    global multilingual_stack

    if not multilingual_stack:
        # Nothing to cancel: make sure the next click starts a new box.
        multilingual_state = 0
    elif len(multilingual_stack[-1]) == 2:
        # Drop the pending corner; the next click starts a new box.
        multilingual_stack = multilingual_stack[:-1]
        multilingual_state = 0
    else:
        # Re-open the last box: keep its first corner, await the second again.
        x, y, _, _ = multilingual_stack[-1]
        multilingual_stack = multilingual_stack[:-1] + [[x, y]]
        multilingual_state = 1

    box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255))
    draw = ImageDraw.Draw(box_sketch_template)
    text_color = (255, 0, 0)
    r = 4

    for i, text_position in enumerate(multilingual_stack):
        if len(text_position) == 2:
            x, y = text_position
            draw.text((x + 2, y), str(i + 1), font=font, fill=text_color)
            draw.ellipse(((x - r, y - r), (x + r, y + r)), fill='red')
        elif len(text_position) == 4:
            x0, y0, x1, y1 = text_position
            x0, x1 = min(x0, x1), max(x0, x1)
            y0, y1 = min(y0, y1), max(y0, y1)
            draw.text((x0 + 2, y0), str(i + 1), font=font, fill=text_color)
            draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0))

    return box_sketch_template

def exe_undo(
    box_sketch_template
):
    """Clear the English canvas: forget every box and reset the click state.

    Args:
        box_sketch_template: Current canvas image (unused).

    Returns:
        A blank white 1024x1024 PIL image.
    """
    global state, stack

    state, stack = 0, []
    return Image.new('RGB', (1024, 1024), (255, 255, 255))

def exe_undo_multilingual(
    box_sketch_template
):
    """Clear the multilingual canvas: forget every box and reset the click state.

    Args:
        box_sketch_template: Current canvas image (unused).

    Returns:
        A blank white 1024x1024 PIL image.
    """
    global multilingual_state, multilingual_stack

    multilingual_state, multilingual_stack = 0, []
    return Image.new('RGB', (1024, 1024), (255, 255, 255))

def process_box():
    """Build the visibility updates for the English input rows.

    Row 0 plus one row per entry on ``stack`` are made visible; all other
    rows are hidden. The count is capped at ``MAX_TEXT_BOX`` because only
    ``MAX_TEXT_BOX + 1`` rows exist; without the cap, drawing more boxes than
    that raised ``IndexError``.

    Returns:
        A list of ``gr.update`` objects: one making the enclosing container
        visible, followed by ``MAX_TEXT_BOX + 1`` row updates.
    """
    shown_rows = min(len(stack), MAX_TEXT_BOX) + 1
    visibilities = [
        gr.update(visible=row < shown_rows) for row in range(MAX_TEXT_BOX + 1)
    ]

    return [gr.update(visible=True), *visibilities]

def process_box_multilingual():
    """Build the visibility updates for the multilingual input rows.

    Row 0 plus one row per entry on ``multilingual_stack`` are made visible;
    all other rows are hidden. The count is capped at ``MAX_TEXT_BOX``
    because only ``MAX_TEXT_BOX + 1`` rows exist; without the cap, drawing
    more boxes than that raised ``IndexError``.

    Returns:
        A list of ``gr.update`` objects: one making the enclosing container
        visible, followed by ``MAX_TEXT_BOX + 1`` row updates.
    """
    shown_rows = min(len(multilingual_stack), MAX_TEXT_BOX) + 1
    visibilities = [
        gr.update(visible=row < shown_rows) for row in range(MAX_TEXT_BOX + 1)
    ]

    return [gr.update(visible=True), *visibilities]

@torch.inference_mode()
@spaces.GPU(enable_queue=True, duration=120)
def generate_image(bg_prompt, bg_class, bg_tags, seed, cfg, *conditions):
    """Render one image with the English pipeline from the drawn layout.

    Args:
        bg_prompt: Background description (required).
        bg_class: Optional text prepended to ``bg_prompt`` as ``"<class>. "``.
        bg_tags: Optional text appended to ``bg_prompt`` as ``" Tags: <tags>"``.
        seed: ``-1`` leaves the generator unseeded; any other value is passed
            to ``manual_seed``.
        cfg: Guidance scale passed to the pipeline.
        *conditions: Per-box inputs laid out as three consecutive groups of
            ``MAX_TEXT_BOX`` values: prompts, colors, font types.

    Returns:
        The first image produced by the pipeline.

    Raises:
        gr.Error: If the background prompt is empty or a box lacks a prompt,
            color or font type.
    """
    stack_cp = deepcopy(stack)
    print(f"conditions: {conditions}")

    # 1. parse input
    # A trailing two-element entry is a box whose second corner was never
    # clicked; it is ignored. An empty stack simply means zero boxes
    # (previously this raised IndexError on stack_cp[-1]).
    if stack_cp and len(stack_cp[-1]) != 4:
        stack_cp.pop()
    # Only MAX_TEXT_BOX boxes have input rows; indexing past that would read
    # values from the next group of `conditions`.
    num_boxes = min(len(stack_cp), MAX_TEXT_BOX)
    prompts = [conditions[i] for i in range(num_boxes)]
    colors = [conditions[i + MAX_TEXT_BOX] for i in range(num_boxes)]
    font_type = [conditions[i + MAX_TEXT_BOX * 2] for i in range(num_boxes)]

    # 2. input check
    bboxes = []
    styles = []
    if bg_prompt == "" or bg_prompt is None:
        raise gr.Error("Empty background prompt!")
    for i, (prompt, color, style) in enumerate(zip(prompts, colors, font_type)):
        if prompt == "" or prompt is None:
            raise gr.Error(f"Invalid prompt for text box {i + 1} !")
        if color is None:
            raise gr.Error(f"Invalid color for text box {i + 1} !")
        if style is None:
            raise gr.Error(f"Invalid style for text box {i + 1} !")
        # The two corners may have been clicked in any order; normalise to
        # [left, top, width, height] as fractions of the 1024px canvas so the
        # size is never negative (this matches what the canvas draws).
        x0, y0, x1, y1 = stack_cp[i]
        bboxes.append(
            [
                min(x0, x1) / 1024,
                min(y0, y1) / 1024,
                abs(x1 - x0) / 1024,
                abs(y1 - y0) / 1024,
            ]
        )
        styles.append(
            {
                'color': webcolors.name_to_hex(color),
                'font-family': style,
            }
        )

    # 3. format input
    if bg_class != "" and bg_class is not None:
        bg_prompt = bg_class + ". " + bg_prompt
    if bg_tags != "" and bg_tags is not None:
        bg_prompt += " Tags: " + bg_tags
    text_prompt = prompt_format.format_prompt(prompts, styles)

    print(f"bg_prompt: {bg_prompt}")
    print(f"text_prompt: {text_prompt}")

    # 4. inference
    # NOTE(review): with seed == -1 the generator is created but never seeded
    # here; confirm whether -1 is meant to give a different image per call.
    generator = torch.Generator(device=device)
    if seed != -1:
        generator = generator.manual_seed(int(seed))
    with torch.cuda.amp.autocast():
        image = pipeline(
            prompt=bg_prompt,
            text_prompt=text_prompt,
            texts=prompts,
            bboxes=bboxes,
            num_inference_steps=50,
            guidance_scale=cfg,
            generator=generator,
            text_attn_mask=None,
        ).images[0]

    # Free cached GPU memory before handing the image back.
    flush()

    return image

@torch.inference_mode()
@spaces.GPU(enable_queue=True, duration=120)
def generate_image_multilingual(bg_prompt, bg_class, bg_tags, seed, cfg, *conditions):
    """Render one image with the multilingual pipeline from the drawn layout.

    Args:
        bg_prompt: Background description (required).
        bg_class: Optional text prepended to ``bg_prompt`` as ``"<class>. "``.
        bg_tags: Optional text appended to ``bg_prompt`` as ``" Tags: <tags>"``.
        seed: ``-1`` leaves the generator unseeded; any other value is passed
            to ``manual_seed``.
        cfg: Guidance scale passed to the pipeline.
        *conditions: Per-box inputs laid out as three consecutive groups of
            ``MAX_TEXT_BOX`` values: prompts, colors, and font choices of the
            form ``"<Language>: <font name>"``.

    Returns:
        The first image produced by the pipeline.

    Raises:
        gr.Error: If the background prompt is empty, a box lacks a prompt or
            color, or a font choice is missing or malformed.
    """
    stack_cp = deepcopy(multilingual_stack)
    print(f"conditions: {conditions}")

    # 1. parse input
    # A trailing two-element entry is a box whose second corner was never
    # clicked; it is ignored. An empty stack simply means zero boxes
    # (previously this raised IndexError on stack_cp[-1]).
    if stack_cp and len(stack_cp[-1]) != 4:
        stack_cp.pop()
    # Only MAX_TEXT_BOX boxes have input rows; indexing past that would read
    # values from the next group of `conditions`.
    num_boxes = min(len(stack_cp), MAX_TEXT_BOX)

    prompts = []
    colors = []
    font_type = []
    for i in range(num_boxes):
        choice = conditions[i + MAX_TEXT_BOX * 2]
        parts = choice.split(":") if choice is not None else []
        lang = parts[0].strip() if parts else ""
        # A missing ":" or an unknown language used to surface as a bare
        # IndexError / KeyError; report it like the other input problems.
        if len(parts) < 2 or lang not in multilingual_reverse_code_dict:
            raise gr.Error(f"Invalid conditions for box {i + 1} !")
        # Named font_name so the module-level `font` is not shadowed.
        font_name = parts[1].strip()
        print(choice, " ", lang, " ", font_name)
        prompts.append(conditions[i])
        colors.append(conditions[i + MAX_TEXT_BOX])
        font_type.append(f'{multilingual_reverse_code_dict[lang]}-{font_name}')

    # 2. input check
    bboxes = []
    styles = []
    if bg_prompt == "" or bg_prompt is None:
        raise gr.Error("Empty background prompt!")
    for i, (prompt, color, style) in enumerate(zip(prompts, colors, font_type)):
        if prompt == "" or prompt is None:
            raise gr.Error(f"Invalid prompt for text box {i + 1} !")
        if color is None:
            raise gr.Error(f"Invalid color for text box {i + 1} !")

        # The two corners may have been clicked in any order; normalise to
        # [left, top, width, height] as fractions of the 1024px canvas so the
        # size is never negative (this matches what the canvas draws).
        x0, y0, x1, y1 = stack_cp[i]
        bboxes.append(
            [
                min(x0, x1) / 1024,
                min(y0, y1) / 1024,
                abs(x1 - x0) / 1024,
                abs(y1 - y0) / 1024,
            ]
        )
        styles.append(
            {
                'color': webcolors.name_to_hex(color),
                'font-family': style,
            }
        )

    # 3. format input
    if bg_class != "" and bg_class is not None:
        bg_prompt = bg_class + ". " + bg_prompt
    if bg_tags != "" and bg_tags is not None:
        bg_prompt += " Tags: " + bg_tags
    text_prompt = multilingual_prompt_format.format_prompt(prompts, styles)

    print(f"bg_prompt: {bg_prompt}")
    print(f"text_prompt: {text_prompt}")

    # 4. inference
    # NOTE(review): with seed == -1 the generator is created but never seeded
    # here; confirm whether -1 is meant to give a different image per call.
    generator = torch.Generator(device=device)
    if seed != -1:
        generator = generator.manual_seed(int(seed))
    with torch.cuda.amp.autocast():
        image = pipeline_multilingual(
            prompt=bg_prompt,
            text_prompt=text_prompt,
            texts=prompts,
            bboxes=bboxes,
            num_inference_steps=50,
            guidance_scale=cfg,
            generator=generator,
            text_attn_mask=None,
        ).images[0]

    # Free cached GPU memory before handing the image back.
    flush()

    return image

def process_example(prev_img, bg_prompt, bg_class, bg_tags, color_str, style_str, text_str, box_str, seed, cfg):
    """Load an example row into the English canvas and input widgets.

    The packed strings are split (colors on ``,``; styles and boxes on ``;``;
    prompts on ``**********``), the global ``stack`` is rebuilt from the
    boxes, and the canvas is redrawn.

    Args:
        prev_img, bg_prompt, bg_class, bg_tags, cfg: Unused here.
        color_str: Comma-separated color names.
        style_str: Semicolon-separated font types.
        text_str: Box prompts joined by ``**********``.
        box_str: Semicolon-separated boxes, each four comma-separated
            fractions of the canvas wrapped in one bracket character per side.
        seed: Returned unchanged.

    Returns:
        A list: container visibility update, redrawn canvas, seed, then the
        row visibility updates, colors, styles and prompts (each of the last
        three padded to ``MAX_TEXT_BOX`` entries).
    """
    global stack, state

    raw_colors = color_str.split(",")
    raw_styles = style_str.split(";")
    boxes = box_str.split(";")
    prompts = text_str.split("**********")
    colors = [entry.strip() for entry in raw_colors]
    styles = [entry.strip() for entry in raw_styles]
    # Pad up to MAX_TEXT_BOX entries (a negative count pads nothing).
    colors = colors + [None] * (MAX_TEXT_BOX - len(colors))
    styles = styles + [None] * (MAX_TEXT_BOX - len(styles))
    prompts = prompts + [""] * (MAX_TEXT_BOX - len(prompts))

    state = 0
    stack = []
    print(boxes)
    for raw_box in boxes:
        print(raw_box)
        inner = raw_box.strip()[1:-1]  # drop the surrounding bracket characters
        print(inner)
        fields = inner.split(",")
        print(fields)
        # NOTE(review): eval() runs arbitrary expressions; only safe while
        # box_str comes from trusted, author-written examples.
        x = eval(fields[0].strip()) * 1024
        y = eval(fields[1].strip()) * 1024
        w = eval(fields[2].strip()) * 1024
        h = eval(fields[3].strip()) * 1024
        # Fractions -> pixel corners; the far corner is rounded to nearest.
        stack.append([int(x), int(y), int(x + w + 0.5), int(y + h + 0.5)])

    visibilities = [gr.update(visible=False) for _ in range(MAX_TEXT_BOX + 1)]
    for n in range(len(stack) + 1):
        visibilities[n] = gr.update(visible=True)

    canvas = Image.new('RGB', (1024, 1024), (255, 255, 255))
    painter = ImageDraw.Draw(canvas)
    red = (255, 0, 0)
    dot_radius = 4

    for number, entry in enumerate(stack, start=1):
        if len(entry) == 2:
            px, py = entry
            painter.text((px + 2, py), str(number), font=font, fill=red)
            painter.ellipse(
                ((px - dot_radius, py - dot_radius), (px + dot_radius, py + dot_radius)),
                fill='red',
            )
        elif len(entry) == 4:
            left, right = min(entry[0], entry[2]), max(entry[0], entry[2])
            top, bottom = min(entry[1], entry[3]), max(entry[1], entry[3])
            painter.text((left + 2, top), str(number), font=font, fill=red)
            painter.rectangle((left, top, right, bottom), outline=(255, 0, 0))

    return [
        gr.update(visible=True), canvas, seed, *visibilities, *colors, *styles, *prompts,
    ]

def process_example_multilingual(prev_img, bg_prompt, bg_class, bg_tags, color_str, style_str, text_str, box_str, seed, cfg):
    """Load an example row into the multilingual canvas and input widgets.

    The packed strings are split (colors on ``,``; styles and boxes on ``;``;
    prompts on ``**********``), the global ``multilingual_stack`` is rebuilt
    from the boxes, and the canvas is redrawn.

    Args:
        prev_img, bg_prompt, bg_class, bg_tags, cfg: Unused here.
        color_str: Comma-separated color names.
        style_str: Semicolon-separated font choices.
        text_str: Box prompts joined by ``**********``.
        box_str: Semicolon-separated boxes, each four comma-separated
            fractions of the canvas wrapped in one bracket character per side.
        seed: Returned unchanged.

    Returns:
        A list: container visibility update, redrawn canvas, seed, then the
        row visibility updates, colors, styles and prompts (each of the last
        three padded to ``MAX_TEXT_BOX`` entries).
    """
    global multilingual_stack, multilingual_state

    raw_colors = color_str.split(",")
    raw_styles = style_str.split(";")
    print(raw_styles)
    boxes = box_str.split(";")
    prompts = text_str.split("**********")
    colors = [entry.strip() for entry in raw_colors]
    styles = [entry.strip() for entry in raw_styles]
    # Pad up to MAX_TEXT_BOX entries (a negative count pads nothing).
    colors = colors + [None] * (MAX_TEXT_BOX - len(colors))
    styles = styles + [None] * (MAX_TEXT_BOX - len(styles))
    prompts = prompts + [""] * (MAX_TEXT_BOX - len(prompts))

    multilingual_state = 0
    multilingual_stack = []
    print(boxes)
    for raw_box in boxes:
        print(raw_box)
        inner = raw_box.strip()[1:-1]  # drop the surrounding bracket characters
        print(inner)
        fields = inner.split(",")
        print(fields)
        # NOTE(review): eval() runs arbitrary expressions; only safe while
        # box_str comes from trusted, author-written examples.
        x = eval(fields[0].strip()) * 1024
        y = eval(fields[1].strip()) * 1024
        w = eval(fields[2].strip()) * 1024
        h = eval(fields[3].strip()) * 1024
        # Fractions -> pixel corners; the far corner is rounded to nearest.
        multilingual_stack.append([int(x), int(y), int(x + w + 0.5), int(y + h + 0.5)])

    visibilities = [gr.update(visible=False) for _ in range(MAX_TEXT_BOX + 1)]
    for n in range(len(multilingual_stack) + 1):
        visibilities[n] = gr.update(visible=True)

    canvas = Image.new('RGB', (1024, 1024), (255, 255, 255))
    painter = ImageDraw.Draw(canvas)
    red = (255, 0, 0)
    dot_radius = 4

    for number, entry in enumerate(multilingual_stack, start=1):
        if len(entry) == 2:
            px, py = entry
            painter.text((px + 2, py), str(number), font=font, fill=red)
            painter.ellipse(
                ((px - dot_radius, py - dot_radius), (px + dot_radius, py + dot_radius)),
                fill='red',
            )
        elif len(entry) == 4:
            left, right = min(entry[0], entry[2]), max(entry[0], entry[2])
            top, bottom = min(entry[1], entry[3]), max(entry[1], entry[3])
            painter.text((left + 2, top), str(number), font=font, fill=red)
            painter.rectangle((left, top, right, bottom), outline=(255, 0, 0))

    return [
        gr.update(visible=True), canvas, seed, *visibilities, *colors, *styles, *prompts,
    ]

def build_input_block(color_idx_list, font_idx_list, examples):
    """Build the English-tab widgets: layout canvas, per-box inputs and examples.

    Creates Gradio components in the currently active layout context (``main``
    calls this inside a ``gr.Tab``) and wires them to the English handlers
    ``get_pixels``, ``exe_redo``, ``exe_undo``, ``process_box``,
    ``generate_image`` and ``process_example``.

    Args:
        color_idx_list: Choices offered by every per-box color dropdown.
        font_idx_list: Choices offered by every per-box font-type dropdown.
        examples: Rows handed to ``gr.Examples``. Each row supplies, in order:
            preview image, background prompt, background type, background
            tags, color list, font list, text list, bbox list, seed, CFG scale.
    """
    with gr.Row():
        with gr.Column(elem_id="main-image"):
            # White 1024x1024 canvas. It has no upload sources and is not
            # editable; its content only changes through the handlers below.
            box_sketch_template = gr.Image(
                value=Image.new('RGB', (1024, 1024), (255, 255, 255)),
                sources=[],
                interactive=False,
            )

            # Every click on the canvas goes to get_pixels, whose result
            # replaces the canvas image.
            box_sketch_template.select(get_pixels, [box_sketch_template], [box_sketch_template])

            with gr.Row():
                redo = gr.Button(value='Redo - Cancel last point')
                undo = gr.Button(value='Undo - Clear the canvas')
            redo.click(exe_redo, [box_sketch_template], [box_sketch_template])
            undo.click(exe_undo, [box_sketch_template], [box_sketch_template])

            button_layout = gr.Button("(1) I've finished my layout!", elem_id="main_button", interactive=True)

            prompts = []
            colors = []
            styles = []
            # Slot 0 is the background row; slots 1..MAX_TEXT_BOX are the
            # per-box rows, created hidden.
            color_row = [None] * (MAX_TEXT_BOX + 1)
            with gr.Column(visible=False) as post_box:
                for n in range(MAX_TEXT_BOX + 1):
                    if n == 0:
                        with gr.Row(visible=True) as color_row[n]:
                            bg_prompt = gr.Textbox(label="Design prompt of background", value="")
                            bg_class = gr.Textbox(label="Design type of background (optional)", value="")
                            # Fixed label: this field holds the comma-separated
                            # tags, it used to repeat the "type" wording of
                            # bg_class.
                            bg_tags = gr.Textbox(label="Design tags of background (optional)", value="")
                    else:
                        with gr.Row(visible=False) as color_row[n]:
                            prompts.append(gr.Textbox(label=f"Prompt for box {n}"))
                            colors.append(gr.Dropdown(
                                label=f"Color for box {n}",
                                choices=color_idx_list,
                            ))
                            styles.append(gr.Dropdown(
                                label=f"Font type for box {n}",
                                choices=font_idx_list,
                            ))

                seed_ = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, value=-1, step=1)
                cfg_ = gr.Slider(label="CFG Scale", minimum=1, maximum=10, value=5)
                button_generate = gr.Button("(2) I've finished my texts, colors and styles, generate!", elem_id="main_button", interactive=True, variant='primary')

            # process_box takes no component inputs; its return values are
            # applied to post_box and to every row in color_row.
            button_layout.click(process_box, inputs=[], outputs=[post_box, *color_row])

        with gr.Column():
            output_image = gr.Image(label="Output Image", interactive=False)

        # Input order is fixed: background fields, seed, CFG, then all prompts,
        # all colors and all styles.
        button_generate.click(generate_image, inputs=[bg_prompt, bg_class, bg_tags, seed_, cfg_, *(prompts + colors + styles)], outputs=[output_image], queue=True)

    with gr.Row():
        # examples
        # Hidden components that only carry the list-valued example columns
        # into process_example.
        color_str = gr.Textbox(label="Color list", value="", visible=False)
        style_str = gr.Textbox(label="Font type list", value="", visible=False)
        box_str = gr.Textbox(label="Bbox list", value="", visible=False)
        text_str = gr.Textbox(label="Text list", value="", visible=False)
        prev_img = gr.Image(label="Preview", visible=False)

        gr.Examples(
            examples=examples,
            inputs=[
                prev_img,
                bg_prompt,
                bg_class,
                bg_tags,
                color_str,
                style_str,
                text_str,
                box_str,
                seed_,
                cfg_,
            ],
            outputs=[post_box, box_sketch_template, seed_, *color_row, *colors, *styles, *prompts],
            fn=process_example,
            cache_examples=False,
            run_on_click=True,
            label='Examples',
        )

def build_input_block_multilingual(color_idx_list, font_idx_list, examples):
    """Build the Multilingual-tab widgets: layout canvas, per-box inputs and examples.

    Creates Gradio components in the currently active layout context (``main``
    calls this inside a ``gr.Tab``) and wires them to the multilingual handlers
    ``get_pixels_multilingual``, ``exe_redo_multilingual``,
    ``exe_undo_multilingual``, ``process_box_multilingual``,
    ``generate_image_multilingual`` and ``process_example_multilingual``.

    Args:
        color_idx_list: Choices offered by every per-box color dropdown.
        font_idx_list: Choices offered by every per-box font-type dropdown.
        examples: Rows handed to ``gr.Examples``. Each row supplies, in order:
            preview image, background prompt, background type, background
            tags, color list, font list, text list, bbox list, seed, CFG scale.
    """
    with gr.Row():
        with gr.Column(elem_id="main-image"):
            # White 1024x1024 canvas. It has no upload sources and is not
            # editable; its content only changes through the handlers below.
            box_sketch_template = gr.Image(
                value=Image.new('RGB', (1024, 1024), (255, 255, 255)),
                sources=[],
                interactive=False,
            )

            # Every click on the canvas goes to get_pixels_multilingual, whose
            # result replaces the canvas image.
            box_sketch_template.select(get_pixels_multilingual, [box_sketch_template], [box_sketch_template])

            with gr.Row():
                redo = gr.Button(value='Redo - Cancel last point')
                undo = gr.Button(value='Undo - Clear the canvas')
            redo.click(exe_redo_multilingual, [box_sketch_template], [box_sketch_template])
            undo.click(exe_undo_multilingual, [box_sketch_template], [box_sketch_template])

            button_layout = gr.Button("(1) I've finished my layout!", elem_id="main_button", interactive=True)

            prompts = []
            colors = []
            styles = []
            # Slot 0 is the background row; slots 1..MAX_TEXT_BOX are the
            # per-box rows, created hidden.
            color_row = [None] * (MAX_TEXT_BOX + 1)
            with gr.Column(visible=False) as post_box:
                for n in range(MAX_TEXT_BOX + 1):
                    if n == 0:
                        with gr.Row(visible=True) as color_row[n]:
                            bg_prompt = gr.Textbox(label="Design prompt of background", value="")
                            bg_class = gr.Textbox(label="Design type of background (optional)", value="")
                            # Fixed label: this field holds the comma-separated
                            # tags, it used to repeat the "type" wording of
                            # bg_class.
                            bg_tags = gr.Textbox(label="Design tags of background (optional)", value="")
                    else:
                        with gr.Row(visible=False) as color_row[n]:
                            prompts.append(gr.Textbox(label=f"Prompt for box {n}"))
                            colors.append(gr.Dropdown(
                                label=f"Color for box {n}",
                                choices=color_idx_list,
                            ))
                            styles.append(gr.Dropdown(
                                label=f"Font type for box {n}",
                                choices=font_idx_list,
                            ))

                seed_ = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, value=-1, step=1)
                cfg_ = gr.Slider(label="CFG Scale", minimum=1, maximum=10, value=5)
                button_generate = gr.Button("(2) I've finished my texts, colors and styles, generate!", elem_id="main_button", interactive=True, variant='primary')

            # process_box_multilingual takes no component inputs; its return
            # values are applied to post_box and to every row in color_row.
            button_layout.click(process_box_multilingual, inputs=[], outputs=[post_box, *color_row])

        with gr.Column():
            output_image = gr.Image(label="Output Image", interactive=False)

        # Input order is fixed: background fields, seed, CFG, then all prompts,
        # all colors and all styles.
        button_generate.click(generate_image_multilingual, inputs=[bg_prompt, bg_class, bg_tags, seed_, cfg_, *(prompts + colors + styles)], outputs=[output_image], queue=True)

    with gr.Row():
        # examples
        # Hidden components that only carry the list-valued example columns
        # into process_example_multilingual.
        color_str = gr.Textbox(label="Color list", value="", visible=False)
        style_str = gr.Textbox(label="Font type list", value="", visible=False)
        box_str = gr.Textbox(label="Bbox list", value="", visible=False)
        text_str = gr.Textbox(label="Text list", value="", visible=False)
        prev_img = gr.Image(label="Preview", visible=False)

        gr.Examples(
            examples=examples,
            inputs=[
                prev_img,
                bg_prompt,
                bg_class,
                bg_tags,
                color_str,
                style_str,
                text_str,
                box_str,
                seed_,
                cfg_,
            ],
            outputs=[post_box, box_sketch_template, seed_, *color_row, *colors, *styles, *prompts],
            fn=process_example_multilingual,
            cache_examples=False,
            run_on_click=True,
            label='Examples',
        )

def main():
    """Assemble and launch the Gradio demo.

    Calls ``init_pipeline()``, loads the color and font choice lists from the
    JSON files under ``assets/``, builds the "Multilingual" and "English" tabs
    with their bundled examples, then queues and launches the app.
    """

    init_pipeline()

    # load configs
    # list() over a loaded JSON object keeps its keys; those become the
    # dropdown choices.
    with open('assets/color_idx.json', 'r', encoding='utf-8') as f:
        color_idx_dict = json.load(f)
    color_idx_list = list(color_idx_dict)
    with open('assets/font_idx_512.json', 'r', encoding='utf-8') as f:
        eng_font_idx_dict = json.load(f)
    eng_font_idx_list = list(eng_font_idx_dict)

    # Multilingual font choices are labelled "<Language>: <font name>".
    multi_font_idx_list = []
    for lang in multilingual_font_dict:
        with open(f'assets/multi_fonts/{lang}.json', 'r', encoding='utf-8') as f:
            lang_font_list = json.load(f)
        for font_entry in lang_font_list:
            # Drops the first three characters of the entry's first field --
            # presumably the "<code>-" language prefix; verify against the
            # asset files.
            font_name = font_entry[0][3:]
            multi_font_idx_list.append(f"{multilingual_code_dict[lang]}: {font_name}")

    # Page header. This is an f-string, so the doubled braces in the <style>
    # rule render as single braces. The <style> element is now closed with
    # </style> (it used to end with a stray </p>).
    html = f"""<h1>Glyph-ByT5: A Customized Text Encoder for Accurate Visual Text Rendering</h1>
            <h2><a href='https://glyph-byt5.github.io/'>Glyph-ByT5 Project Page</a> |<a href='https://glyph-byt5-v2.github.io/'>Glyph-ByT5-v2 Project Page</a> | <a href='https://arxiv.org/abs/2403.09622'>Glyph-ByT5 arXiv Paper</a> |<a href='https://arxiv.org/abs/2406.10208'>Glyph-ByT5-v2 arXiv Paper</a> | <a href='https://github.com/AIGText/Glyph-ByT5'>Github</a></h2>
            <p><b>We present a basic version of Glyph-SDXL, and a multilingual version Glyph-SDXL-v2 supporting up to 10 languages: English, Chinese, French, German, Spanish, Portuguese, Italian, Russian, Japanese and Korean.</b></p>
            <p><b>Note: due to limited capacity, we support 5000 chars in Chinese, 1148 chars in Japanese and 617 in Korean. Certain uncommon characters might not be supported for these three languages.</b></p>
            <p><b>Models presented in this demo are all based on albedo-xl!</b></p>
            <p><b>Try some examples at the bottom of the page to get started!</b></p>
            <p><b>Quick Guide:</b></p>
            <p>1. <b>Select bounding boxes</b> on the canvas on the left <b>by clicking twice</b>. </p>
            <p>2. Click "Redo" if you want to cancel last point, "Undo" for clearing the canvas. </p>
            <p>3. <b>Click "I've finished my layout!"</b> to start choosing specific prompts, colors and font-types. </p>
            <p>4. Enter a <b>design prompt</b> for the background image. Optionally, you can choose to specify the design categories and tags (separated by a comma). </p>
            <p>5. For each text box, <b>enter the text prompts in the text box</b> on the left, and <b>select colors and font-types from the drop boxes</b> on the right. </p>
            <p>6. <b>Click on "I've finished my texts, colors and styles, generate!"</b> to start generating!. </p>
            <style>.btn {{flex-grow: unset !important;}} </style>
            """

    # Custom stylesheet passed to gr.Blocks. It must contain CSS only; the
    # stray "<style>" token that used to end this string has been removed.
    css = '''
    #color-bg{display:flex;justify-content: center;align-items: center;}
    .color-bg-item{width: 100%; height: 32px}
    #main_button{width:100%}
    '''

    # Each example row: preview image, background prompt, background type,
    # background tags, ", "-separated colors, "; "-separated fonts,
    # "**********"-separated texts, "; "-separated [x, y, w, h] boxes, seed,
    # CFG scale. The strings are prompt data and are kept verbatim.
    eng_examples=[
        [
            'examples/easter.webp',
            'The image features a small bunny rabbit sitting in a basket filled with various flowers. The basket is placed on a yellow background, creating a vibrant and cheerful scene. The flowers surrounding the rabbit come in different sizes and colors, adding to the overall visual appeal of the image. The rabbit appears to be the main focus of the scene, and its presence among the flowers creates a sense of harmony and balance.',
            'Facebook Post',
            'green, yellow, minimalist, easter day, happy easter day, easter, happy easter, decoration, happy, egg, spring, selebration, poster, illustration, greeting, season, design, colorful, cute, template',
            'darkolivegreen, darkolivegreen, darkolivegreen',
            'Gagalin-Regular; Gagalin-Regular; Brusher-Regular',
            'MAY ALLYOUR PRAYERS BE ANSWERED**********HAVE A HAPPY**********Easter Day',
            '[0.08267477203647416, 0.5355623100303951, 0.42857142857142855, 0.07477203647416414]; [0.08389057750759879, 0.1951367781155015, 0.38054711246200607, 0.03768996960486322]; [0.07537993920972644, 0.2601823708206687, 0.49544072948328266, 0.14650455927051673]',
            1,
            5
        ],
        [
            'examples/shower.webp',
            'The image features a large gray elephant sitting in a field of flowers, holding a smaller elephant in its arms. The scene is quite serene and picturesque, with the two elephants being the main focus of the image. The field is filled with various flowers, creating a beautiful and vibrant backdrop for the elephants.',
            'Cards and invitations',
            'Light green, orange, Illustration, watercolor, playful, Baby shower invitation, baby boy shower invitation, baby boy, welcoming baby boy, koala baby shower invitation, baby shower invitation for baby shower, baby boy invitation, background, playful baby shower card, baby shower, card, newborn, born, Baby Shirt Baby Shower Invitation',
            'peru, olive, olivedrab, peru, peru, peru',
            'LilitaOne; Sensei-Medium; Sensei-Medium; LilitaOne; LilitaOne; LilitaOne',
            "RSVP to +123-456-7890**********Olivia Wilson**********Baby Shower**********Please Join Us For a**********In Honoring**********23 November, 2021 | 03:00 PM Fauget Hotels",
            '[0.07112462006079028, 0.6462006079027356, 0.3373860182370821, 0.026747720364741642]; [0.07051671732522796, 0.38662613981762917, 0.37264437689969604, 0.059574468085106386]; [0.07234042553191489, 0.15623100303951368, 0.6547112462006079, 0.12401215805471125]; [0.0662613981762918, 0.06747720364741641, 0.3981762917933131, 0.035866261398176294]; [0.07051671732522796, 0.31550151975683893, 0.22006079027355624, 0.03951367781155015]; [0.06990881458966565, 0.48328267477203646, 0.39878419452887537, 0.1094224924012158]',
            870745856,
            5
        ],
        [
            'examples/new_year.webp',
            'The image features a white background with a variety of colorful flowers and decorations. There are several pink flowers scattered throughout the scene, with some positioned closer to the top and others near the bottom. A blue flower can also be seen in the middle of the image. The overall composition creates a visually appealing and vibrant display.',
            'Instagram Posts',
            'grey, navy, purple, pink, teal, colorful, illustration, happy, celebration, post, party, year, new, event, celebrate, happy new year, new year, countdown, sparkle, firework',
            'purple, midnightblue, black, black',
            'Caveat-Regular; Gagalin-Regular; Quicksand-Light; Quicksand-Light',
            'Happy New Year**********2024**********All THE BEST**********A fresh start to start a change for the better.',
            '[0.2936170212765957, 0.2887537993920973, 0.40303951367781155, 0.07173252279635259]; [0.24984802431610942, 0.3951367781155015, 0.46200607902735563, 0.17203647416413373]; [0.3951367781155015, 0.1094224924012158, 0.2109422492401216, 0.02796352583586626]; [0.20911854103343466, 0.6127659574468085, 0.5586626139817629, 0.08085106382978724]',
            763905874,
            5
        ],
        [
            'examples/pancake.webp',
            'The image features a stack of pancakes with syrup and strawberries on top. The pancakes are arranged in a visually appealing manner, with some pancakes placed on top of each other. The syrup is drizzled generously over the pancakes, and the strawberries are scattered around, adding a touch of color and freshness to the scene. The overall presentation of the pancakes is appetizing and inviting.',
            'Instagram Posts',
            'brown, peach, grey, modern, minimalist, simple, colorful, illustration, Instagram post, instagram, post, national pancake day, international pancake day, happy pancake day, pancake day, pancake, sweet, cake, discount, sale',
            'dimgray, white, darkolivegreen',
            'MoreSugarRegular; Chewy-Regular; Chewy-Regular',
            'Get 75% Discount for your first order**********Order Now**********National Pancake Day',
            '[0.043161094224924014, 0.5963525835866261, 0.2936170212765957, 0.08389057750759879]; [0.12279635258358662, 0.79209726443769, 0.26382978723404255, 0.05167173252279635]; [0.044984802431610946, 0.09787234042553192, 0.4413373860182371, 0.4158054711246201]',
            1,
            5
        ]
    ]

    multi_examples=[
        [
            'examples/cake.webp',
            'The image features a delicious-looking chocolate cake with chocolate frosting. The cake is placed on a white plate, which is set on a blue tablecloth. The cake appears to be a celebration, possibly a birthday or anniversary, given the presence of a candle. The overall presentation of the cake is elegant and inviting.',
            '',
            '',
            'bisque, bisque, bisque',
            'Chinese: HelloFont-ID-DianHei-EEJ; Chinese: Hellofont-ID-QingHuaXingKai; Chinese: HelloFont-ID-LingLiTi',
            '生日快乐**********只愿你被这世界温柔相待**********妹妹',
            '[0.601823708206687, 0.5556231003039513, 0.35501519756838906, 0.08693009118541034]; [0.6261398176291794, 0.6723404255319149, 0.3252279635258359, 0.1270516717325228]; [0.6553191489361702, 0.4401215805471125, 0.23829787234042554, 0.11063829787234042]',
            7,
            5
        ],
        [
            'examples/xiaoman.webp',
            'The image portrays a young girl sitting on a large green leaf. The leaf is part of a plant with other green leaves. The girl is wearing a yellow dress and a straw hat. She is holding a small yellow flower in her hand. The background of the image is a light blue sky with a few clouds. The overall style of the image is a colorful, cartoon-like illustration.',
            '',
            '',
            'darkolivegreen, goldenrod, white, darkolivegreen, darkolivegreen',
            'Chinese: HYQiHei-AZEJ; English: TAN MERINGUE; Chinese: SourceHanSansSC-ExtraLight; Chinese: AlibabaPuHuiTi-Bold; English: SairaCondensed-Regular',
            '小满是二十四节气之一，夏季的第二个节气。该节气是指夏熟作物的籽粒开始灌浆饱满，但还未成熟，只是小满，还未大满。**********2022.5.21**********饱满的灵魂 无畏的生长 二十四节气之一**********今日小满**********Grain Buds',
            '[0.09969604863221884, 0.4370820668693009, 0.31124620060790276, 0.2072948328267477]; [ 0.10455927051671733, 0.09908814589665653, 0.22127659574468084, 0.034650455927051675]; [ 0.09969604863221884, 0.9398176291793313, 0.7993920972644377, 0.026747720364741642]; [ 0.09787234042553192, 0.17142857142857143, 0.4231003039513678, 0.10577507598784194]; [ 0.10091185410334347, 0.3100303951367781, 0.2772036474164134, 0.053495440729483285]',
            0,
            5
        ],
        [
            'examples/ski.webp',
            'The image depicts a winter sports scene. In the foreground, there is a person on a snowboard. The snowboarder is wearing a white jacket, black pants, and a black helmet with goggles. The snowboarder is in the process of performing a trick, with one hand extended and the other hand holding the snowboard.\nThe background of the image shows a snowy landscape with trees and a clear blue sky. The overall style of the image is a digital illustration with a cartoonish and colorful aesthetic.',
            '',
            '',
            'white, white',
            'Chinese: CanvaEndeavorBlackSC; Chinese: SourceHanSansSC-Light',
            '总要来一趟哈尔滨滑雪吧**********冰雪大世界',
            '[0.19696048632218846, 0.23829787234042554, 0.6054711246200608, 0.05592705167173252]; [0.19756838905775076, 0.09422492401215805, 0.6042553191489362, 0.1209726443768997]',
            1,
            5
        ],
        [
            'examples/song.webp',
            'The image features a cartoon of a fox character. The fox is standing on a stage with a microphone in front of it. The fox is wearing a pink shirt and is holding a bouquet of flowers in its left paw. The background of the image is a light pink color with a pattern of small flowers.',
            '',
            '',
            'coral',
            'Chinese: XianErTi',
            '世界儿歌日',
            '[0.08753799392097264, 0.11124620060790273, 0.8231003039513678, 0.22066869300911854]',
            1,
            5
        ],
        [
            'examples/festival.webp',
            'The image shows a nighttime cityscape with a dark sky filled with stars. The city is illuminated with various lights, suggesting a bustling urban environment. The image is framed by a black border, and there is a watermark or logo in the bottom right corner, which appears to be a stylized letter \'C\'. The overall style of the image is illustrative and colorful, with a focus on the contrast between the dark sky and the brightly lit city.',
            '',
            '',
            'white, white',
            'Japanese: MotoyaMinchoMiyabiStd-W4; Japanese: JackeyFont',
            '12月30日**********除夜を祝う',
            '[0.4121580547112462, 0.08145896656534954, 0.17386018237082068, 0.02006079027355623]; [0.33069908814589666, 0.29908814589665655, 0.34772036474164136, 0.31550151975683893]',
            42,
            5
        ],
        [
            'examples/woman.webp',
            'The image is a digital illustration featuring a character that appears to be a young woman with a serene expression. She is depicted with long, flowing hair and is wearing a traditional East Asian-style dress with a floral pattern. The dress is predominantly in shades of blue and green, with a hint of pink.\nThe character is seated on a bed of cherry blossoms, which are scattered around her. The blossoms are in full bloom, with their delicate pink petals and white stamens.\n\nThe background of the image is a pale, soft blue sky with a few wispy clouds. The overall atmosphere of the image is one of tranquility and serenity.',
            '',
            '',
            'saddlebrown, black, black, saddlebrown',
            'Korean: SeH-CB; Korean: SourceHanSerifSC-SemiBold; Korean: Canva_YoonGulimPro740; Korean: TDTDLatteOTF',
            '전문 메이크업 아티스트 아름다운 한복 무료 촬영**********행사 기간 5월 6일-5월 8일 행사 장소 상사호 고전 마을**********한복 동호회**********한복 체험 국조 문화 창작전',
            '[0.2674772036474164, 0.5465045592705167, 0.1264437689969605, 0.09787234042553192]; [0.2662613981762918, 0.3161094224924012, 0.17446808510638298, 0.15987841945288753]; [0.2650455927051672, 0.10395136778115502, 0.42613981762917935, 0.07598784194528875]; [0.26261398176291795, 0.20547112462006079, 0.3009118541033435, 0.041945288753799395]',
            317314747,
            5
        ],
        [
            'examples/elephant.webp',
            'The image features a large gray elephant sitting in a field of flowers, holding a smaller elephant in its arms. The scene is quite serene and picturesque, with the two elephants being the main focus of the image. The field is filled with various flowers, creating a beautiful and vibrant backdrop for the elephants.',
            'Cards and invitations',
            'Light green, orange, Illustration, watercolor, playful, Baby shower invitation, baby boy shower invitation, baby boy, welcoming baby boy, koala baby shower invitation, baby shower invitation for baby shower, baby boy invitation, background, playful baby shower card, baby shower, card, newborn, born, Baby Shirt Baby Shower Invitation',
            'peru, olive, olivedrab, peru, peru, peru',
            'Russian: TTRamillas-Italic; Russian: StadioNow-TextItalic; Russian: RubikOne-Regular; Russian: HeroLight-Regular; Russian: BebasNeueBold; Russian: SloopScriptPro-Regular',
            'Ответьте, пожалуйста, на номер +123-456-7890**********Оливия Уилсон**********Детский душ**********Пожалуйста, присоединитесь к нам для**********В ЧЕСТЬ**********23 ноября, 2021 | 15:00 Отели Фоже',
            '[0.07112462006079028, 0.6462006079027356, 0.3373860182370821, 0.026747720364741642]; [0.07051671732522796, 0.38662613981762917, 0.37264437689969604, 0.059574468085106386]; [0.07234042553191489, 0.15623100303951368, 0.6547112462006079, 0.12401215805471125]; [0.0662613981762918, 0.06747720364741641, 0.3981762917933131, 0.035866261398176294]; [0.07051671732522796, 0.31550151975683893, 0.22006079027355624, 0.03951367781155015]; [0.06990881458966565, 0.48328267477203646, 0.39878419452887537, 0.1094224924012158]',
            7,
            5
        ],
        [
            'examples/earth.webp',
            'The image features a green and blue globe with a factory on top of it. The factory is surrounded by trees, giving the impression of a harmonious coexistence between the industrial structure and the natural environment. The globe is prominently displayed in the center of the image, with the factory and trees surrounding it.',
            'Posters',
            'green, modern, earth, world, planet, ecology, background, globe, environment, day, space, map, concept, global, light, hour, energy, power, protect, illustration',
            'white, white',
            'Portuguese: Gliker-Regular; Portuguese: Amsterdam-Three',
            'A TERRA É O QUE TODOS NÓS TEMOS EM COMUM**********Dia da Terra',
            '[0.2875379939209726, 0.2753799392097264, 0.4243161094224924, 0.060790273556231005]; [ 0.2978723404255319, 0.16170212765957448, 0.40364741641337387, 0.10638297872340426]',
            1208360201,
            5
        ],
    ]

    with gr.Blocks(
        title="Glyph-ByT5: A Customized Text Encoder for Accurate Visual Text Rendering",
        css=css,
    ) as demo:

        gr.HTML(html)
        # The Multilingual tab is created first, the English tab second.
        with gr.Tab("Multilingual"):
            build_input_block_multilingual(color_idx_list, multi_font_idx_list, multi_examples)

        with gr.Tab("English"):
            build_input_block(color_idx_list, eng_font_idx_list, eng_examples)

    demo.queue()
    demo.launch()

# Script entry point: build and launch the demo only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    main()