File size: 5,090 Bytes
353fa54
c3a1897
 
 
 
 
 
eb902b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44a0c32
 
 
eb902b3
 
 
 
 
 
 
 
 
 
353fa54
c3a1897
 
 
 
 
353fa54
c3a1897
 
 
 
 
eb902b3
 
44a0c32
 
 
 
 
c3a1897
 
44a0c32
c3a1897
 
44a0c32
 
c3a1897
 
44a0c32
 
 
 
 
 
 
 
 
 
c3a1897
 
 
44a0c32
 
 
 
 
 
 
 
 
 
c3a1897
 
eb902b3
c3a1897
 
 
eb902b3
44a0c32
c3a1897
 
 
 
 
 
 
40adb4f
eb902b3
 
 
44a0c32
eb902b3
 
c3a1897
 
 
 
44a0c32
 
 
c3a1897
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import gradio as gr
import cv2
import numpy as np
from PIL import Image
import base64
from io import BytesIO
from models.image_text_transformation import ImageTextTransformation
import argparse
import torch

# ---------------------------------------------------------------------------
# Command-line configuration
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo', 'gpt4'], default='gpt-3.5-turbo')

# Boolean feature switches as (name, default, help) rows.
# NOTE(review): the first two use store_true with default=True, so passing the
# flag on the command line does not change the parsed value.
_FEATURE_FLAGS = (
    ('image_caption', True, 'Set this flag to True if you want to use BLIP2 Image Caption'),
    ('dense_caption', True, 'Set this flag to True if you want to use Dense Caption'),
    ('semantic_segment', False, 'Set this flag to True if you want to use semantic segmentation'),
)
for _flag_name, _flag_default, _flag_help in _FEATURE_FLAGS:
    parser.add_argument(
        f'--{_flag_name}',
        action='store_true',
        dest=_flag_name,
        default=_flag_default,
        help=_flag_help,
    )

# Per-model device selectors as (name, help) rows; each accepts cuda or cpu.
_DEVICE_FLAGS = (
    ('image_caption_device', 'Select the device: cuda or cpu, gpu memory larger than 14G is recommended'),
    ('dense_caption_device', 'Select the device: cuda or cpu, < 6G GPU is not recommended>'),
    ('semantic_segment_device', 'Select the device: cuda or cpu, gpu memory larger than 14G is recommended'),
    ('contolnet_device', 'Select the device: cuda or cpu, <6G GPU is not recommended>'),
)
for _flag_name, _flag_help in _DEVICE_FLAGS:
    parser.add_argument(f'--{_flag_name}', choices=['cuda', 'cpu'], default='cpu', help=_flag_help)

args = parser.parse_args()

# The demo is pinned to CPU. Automatic detection is intentionally disabled:
# device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"

# Every per-model device option is overwritten with the global choice, so any
# device value supplied on the command line is replaced here.
_forced_device = "cuda" if device == "cuda" else "cpu"
for _flag_name, _flag_help in _DEVICE_FLAGS:
    setattr(args, _flag_name, _forced_device)

def pil_image_to_base64(image):
    """Encode an image as a base64 string of its JPEG bytes.

    Args:
        image: A ``PIL.Image.Image`` (or any object offering the same
            ``mode`` / ``convert`` / ``save`` interface).

    Returns:
        str: Base64 text of the JPEG-encoded image, without any
        ``data:`` URI prefix.
    """
    # Pillow's JPEG writer only accepts these modes; anything else (RGBA, LA,
    # P, ...) raises OSError on save, so such images are converted to RGB
    # first. Images already in a writable mode are saved unchanged.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if getattr(image, "mode", "RGB") not in jpeg_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()

def add_logo():
    """Read ``examples/logo.png`` and return its contents as base64 text.

    The path is relative to the current working directory.

    Returns:
        str: Base64 encoding of the raw file bytes.
    """
    with open("examples/logo.png", "rb") as logo_file:
        raw_bytes = logo_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()

def process_image(image_src, options, processor):
    """Run image->text (and optionally text->image) and render the result as HTML.

    Args:
        image_src: Image reference passed unchanged to
            ``processor.image_to_text``.
        options: Collection of selected option labels. ``"Semantic Segment"``
            enables semantic segmentation and ``"Image Generation"`` enables
            the text->image step. ``None`` is treated as "nothing selected".
        processor: Object exposing ``args``, ``image_to_text(image_src)``
            returning a 4-tuple, and ``text_to_image(text)``.

    Returns:
        str: An HTML fragment with the captions and reasoning text and, when
        image generation is selected, the generated image embedded as a
        base64 JPEG data URI.

    Side effects:
        Sets ``processor.args.semantic_segment`` on the shared processor
        before running the pipeline.
    """
    from html import escape

    # Guard against a missing selection: `"x" in None` would raise TypeError.
    selected = options or ()
    processor.args.semantic_segment = "Semantic Segment" in selected
    image_generation_status = "Image Generation" in selected

    image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)

    # The texts come from models, so escape them before placing them in markup;
    # a stray "<" or "&" would otherwise corrupt (or inject into) the page.
    image_caption_html = escape(str(image_caption))
    dense_caption_html = escape(str(dense_caption))
    region_semantic_html = escape(str(region_semantic))
    gen_text_html = escape(str(gen_text))

    # Combine the outputs into a single HTML output
    custom_output = f'''
    <h2>Image->Text:</h2>
    <div style="display: flex; flex-wrap: wrap;">
        <div style="flex: 1;">
            <h3>Image Caption</h3>
            <p>{image_caption_html}</p>
        </div>
        <div style="flex: 1;">
            <h3>Dense Caption</h3>
            <p>{dense_caption_html}</p>
        </div>
        <div style="flex: 1;">
            <h3>Region Semantic</h3>
            <p>{region_semantic_html}</p>
        </div>
        <div style="flex: 1;">
            <h3>GPT4 Reasoning:</h3>
            <p>{gen_text_html}</p>
        </div>
    </div>
    '''

    # Generate and embed the image in a single branch so the base64 string is
    # always defined exactly where it is used. The model receives the raw
    # (unescaped) text.
    if image_generation_status:
        gen_image = processor.text_to_image(gen_text)
        gen_image_str = pil_image_to_base64(gen_image)
        custom_output += f'''
        <h2>Text->Image:</h2>
        <div style="display: flex; flex-wrap: wrap;">
            <div style="flex: 1;">
                <h3>Generated Image</h3>
                <img src="data:image/jpeg;base64,{gen_image_str}" width="400" style="vertical-align: middle;">
            </div>
        </div>
        '''
    return custom_output

# Build the pipeline object once at import time; every request served by the
# interface below reuses this single instance.
processor = ImageTextTransformation(args)

# Create Gradio input and output components
image_input = gr.inputs.Image(type='filepath', label="Input Image")
# NOTE(review): the two checkbox components below are never passed to
# gr.Interface -- the CheckboxGroup in `inputs` is what the UI actually shows.
# They are kept so the module-level names remain available.
semantic_segment_checkbox = gr.inputs.Checkbox(label="Semantic Segment", default=False)
image_generation_checkbox = gr.inputs.Checkbox(label="Image Generation", default=False)

logo_base64 = add_logo()
# Create the title with the logo. The logo is read from examples/logo.png, so
# the data URI declares image/png to match the embedded bytes.
title_with_logo = f'<img src="data:image/png;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'

# Create Gradio interface
interface = gr.Interface(
    # Bind the shared processor so Gradio only supplies the two UI inputs.
    fn=lambda image, options: process_image(image, options, processor),
    inputs=[image_input,        
            gr.CheckboxGroup(
            label="Options",
            choices=["Semantic Segment", "Image Generation"],
            ),
            ],
    outputs=gr.outputs.HTML(),
    title=title_with_logo,
    description="""
    This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
    \n Since GPU is expensive, we use CPU for demo. Run code local with gpu or google colab we provided for fast speed.
    \n Semantic segment is very slow in cpu(~8m).
    \n Ttext2image model is controlnet is also very slow in cpu(~2m), which used canny edge as reference.
    """
)

# Launch the interface
interface.launch()