import os

import gradio as gr
import pandas as pd
import spaces
import torch
from huggingface_hub import login
from PIL import Image

from src.models.transformer_sd3_SiamLayout import SiamLayoutSD3Transformer2DModel
from src.pipeline.pipeline_CreatiLayout import CreatiLayoutSD3Pipeline
from utils.bbox_visualization import bbox_visualization, scale_boxes

hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("Hugging Face token not found. Please set the HF_TOKEN secret.")
login(token=hf_token)

model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
ckpt_path = "Benson1237/CreatiLayout"

# Load the SiamLayout transformer (layout-aware attention) and plug it into the
# CreatiLayout SD3 pipeline.
transformer_additional_kwargs = dict(attention_type="layout", strict=True)

transformer = SiamLayoutSD3Transformer2DModel.from_pretrained(
    ckpt_path,
    subfolder="transformer",
    torch_dtype=torch.float16,
    **transformer_additional_kwargs,
)

pipe = CreatiLayoutSD3Pipeline.from_pretrained(
    model_path, transformer=transformer, torch_dtype=torch.float16
)
pipe = pipe.to("cuda")

print("Pipeline loaded.")


@spaces.GPU
def process_image_and_text(
    global_caption,
    box_detail_phrases_list: pd.DataFrame,
    boxes: pd.DataFrame,
    seed: int = 42,
    randomize_seed: bool = False,
    guidance_scale: float = 7.5,
    num_inference_steps: int = 50,
):
    if randomize_seed:
        seed = torch.randint(0, 100, (1,)).item()

    height = 1024
    width = 1024

    # The Dataframe component yields one region caption per row; flatten to a list of strings.
    box_detail_phrases_list_tmp = box_detail_phrases_list.values.tolist()
    box_detail_phrases_list_tmp = [c[0] for c in box_detail_phrases_list_tmp]
    boxes = boxes.astype(float).values.tolist()

    # Draw the layout on a blank white canvas for visualization.
    # (The original color spec 'rgb(256,256,256)' was out of the valid 0-255 range.)
    white_image = Image.new("RGB", (width, height), color=(255, 255, 255))
    show_input = {
        "boxes": scale_boxes(boxes, width, height),
        "labels": box_detail_phrases_list_tmp,
    }
    bbox_visualization_img = bbox_visualization(white_image, show_input)

    result_img = pipe(
        prompt=global_caption,
        generator=torch.Generator(device="cuda").manual_seed(seed),
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        bbox_phrases=box_detail_phrases_list_tmp,
        bbox_raw=boxes,
        height=height,
        width=width,
    ).images[0]

    return bbox_visualization_img, result_img


def get_samples():
    sample_list = [
        {
            "global_caption": "A picturesque scene features Iron Man standing confidently on a rugged rock by the sea, holding a drawing board with his hands. The board displays the words 'Creative Layout' in a playful, hand-drawn font. The serene sea shimmers under the setting sun. The sky is painted with a gradient of warm colors, from deep oranges to soft purples.",
            "region_caption_list": [
                "Iron Man standing confidently on a rugged rock.",
                "A rugged rock by the sea.",
                "A drawing board with the words \"Creative Layout\" in a playful, hand-drawn font.",
                "The serene sea shimmers under the setting sun.",
                "The sky is a shade of deep orange to soft purple.",
            ],
            "region_bboxes_list": [
                [0.40, 0.35, 0.55, 0.80],
                [0.35, 0.75, 0.60, 0.95],
                [0.40, 0.45, 0.55, 0.65],
                [0.00, 0.30, 1.00, 0.90],
                [0.00, 0.00, 1.00, 0.30],
            ],
        },
        {
            "global_caption": "This is a photo showcasing two wooden benches in a park. The bench on the left is painted in a vibrant blue, while the one on the right is painted in green. Both are placed on a path paved with stones, surrounded by lush trees and shrubs. The sunlight filters through the leaves, casting dappled shadows on the ground, creating a tranquil and comfortable atmosphere.",
            "region_caption_list": [
                "A weathered, blue wooden bench with green elements in a natural setting.",
                "Old, weathered wooden benches with green and blue paint.",
                "A dirt path in a park with green grass on the sides and two colorful wooden benches.",
                "Thick, verdant foliage of mature trees in a dense forest.",
            ],
            "region_bboxes_list": [
                [0.30, 0.44, 0.62, 0.78],
                [0.54, 0.41, 0.75, 0.65],
                [0.00, 0.39, 1.00, 1.00],
                [0.00, 0.00, 1.00, 0.43],
            ],
        },
        {
            "global_caption": "This is a wedding photo taken in a photography studio, showing a newlywed couple sitting on a brown leather sofa in a modern indoor setting. The groom is dressed in a pink suit, paired with a pink tie and white shirt, while the bride is wearing a white wedding dress with a long veil. They are sitting on a brown leather sofa, with a wooden table in front of them, on which a bouquet of flowers is placed. The background is a bar with a staircase and a wall decorated with lights, creating a warm and romantic atmosphere.",
            "region_caption_list": [
                "A floral arrangement consisting of roses, carnations, and eucalyptus leaves on a wooden surface.",
                "A white wedding dress with off-the-shoulder ruffles and a long, sheer veil.",
                "A polished wooden table with visible grain and knots.",
                "A close-up of a dark brown leather sofa with tufted upholstery and button details.",
                "A man in a pink suit with a white shirt and red tie, sitting on a leather armchair.",
                "A person in a suit seated on a leather armchair near a wooden staircase with books and bottles.",
                "Bride in white gown with veil, groom in maroon suit and pink tie, seated on leather armchairs.",
            ],
            "region_bboxes_list": [
                [0.09, 0.65, 0.31, 0.93],
                [0.62, 0.25, 0.89, 0.90],
                [0.01, 0.70, 0.78, 0.99],
                [0.76, 0.65, 1.00, 0.99],
                [0.27, 0.32, 0.72, 0.75],
                [0.00, 0.01, 0.52, 0.72],
                [0.27, 0.09, 0.94, 0.89],
            ],
        },
    ]
    # Each sample maps to the (global_caption, region captions, boxes) inputs;
    # region captions are wrapped one-per-row for the Dataframe component.
    return [
        [
            sample["global_caption"],
            [[caption] for caption in sample["region_caption_list"]],
            sample["region_bboxes_list"],
        ]
        for sample in sample_list
    ]


with gr.Blocks() as demo:
    gr.Markdown("# CreatiLayout / Layout-to-Image generation")
    with gr.Row():
        with gr.Column():
            global_caption = gr.Textbox(lines=2, label="Global Caption")
            box_detail_phrases_list = gr.Dataframe(headers=["Region Captions"], label="Region Captions")
            boxes = gr.Dataframe(
                headers=["x1", "y1", "x2", "y2"],
                label="Region Bounding Boxes (x_min, y_min, x_max, y_max)",
            )
            with gr.Accordion("Advanced Settings", open=False):
                seed = gr.Slider(0, 100, step=1, label="Seed", value=42)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
                guidance_scale = gr.Slider(1, 30, step=0.5, label="Guidance Scale", value=7.5)
                num_inference_steps = gr.Slider(1, 50, step=1, label="Number of inference steps", value=50)
        with gr.Column():
            bbox_visualization_img = gr.Image(type="pil", label="Bounding Box Visualization")
        with gr.Column():
            output_image = gr.Image(type="pil", label="Generated Image")

    gr.Button("Generate").click(
        fn=process_image_and_text,
        inputs=[global_caption, box_detail_phrases_list, boxes, seed, randomize_seed, guidance_scale, num_inference_steps],
        outputs=[bbox_visualization_img, output_image],
    )

    gr.Examples(
        examples=get_samples(),
        inputs=[global_caption, box_detail_phrases_list, boxes],
        outputs=[bbox_visualization_img, output_image],
        fn=process_image_and_text,
        cache_examples=True,
    )

if __name__ == "__main__":
    demo.launch()
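
# Headless usage sketch (not part of the app, kept commented out so it never
# runs on import): it reuses the exact keyword arguments that
# process_image_and_text passes to `pipe` above; the phrases, boxes, and
# output filename below are illustrative placeholders, not values from the
# original script.
#
# phrases = ["A red ball on a table.", "A blue cube beside it."]
# raw_boxes = [[0.10, 0.40, 0.45, 0.80], [0.55, 0.40, 0.90, 0.80]]  # normalized x1, y1, x2, y2
# image = pipe(
#     prompt="A red ball next to a blue cube on a wooden table.",
#     generator=torch.Generator(device="cuda").manual_seed(42),
#     guidance_scale=7.5,
#     num_inference_steps=50,
#     bbox_phrases=phrases,
#     bbox_raw=raw_boxes,
#     height=1024,
#     width=1024,
# ).images[0]
# image.save("headless_example.png")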