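"""Gradio demo that chains three models: BLIP VQA proposes candidate answers
to a question about an image, BLIP image captioning describes the image, and
GPT-3 is asked to pick the right answer from the caption and the candidates."""
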
import os

import gradio as gr
import torch
from transformers import (
    BlipForConditionalGeneration,
    BlipForQuestionAnswering,
    BlipProcessor,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BLIP VQA model: answers free-form questions about an image.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)

# BLIP captioning model: generates a natural-language caption for an image.
cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)



def caption(input_image):
    """Generate a single caption for the input image."""
    inputs = cap_processor(input_image, return_tensors="pt").to(device)
    out = cap_model.generate(**inputs, num_beams=1, num_return_sequences=1)
    return "\n".join(cap_processor.batch_decode(out, skip_special_tokens=True))


import openai

# Never hard-code API keys; read the key from the environment instead.
openai.api_key = os.environ.get("OPENAI_API_KEY")


def gpt3(question, vqa_answer, caption):
    """Ask GPT-3 to choose the right answer given the caption, the question,
    and the VQA model's candidate answers."""
    prompt = caption + "\n" + question + "\n" + vqa_answer + "\nTell me the right answer."
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=10,
        n=1,
        stop=None,
        temperature=0.7,
    )
    answer = response.choices[0].text.strip()
    return "input_text:\n" + prompt + "\n\noutput_answer:\n" + answer

    
def inference_chat(input_image, input_text):
    """Answer a question about the image, returning the top candidate
    answers from beam search (one per line)."""
    inputs = processor(images=input_image, text=input_text, return_tensors="pt").to(device)
    out = model_vqa.generate(**inputs, max_length=10, num_beams=5, num_return_sequences=4)
    return "\n".join(processor.batch_decode(out, skip_special_tokens=True))
    
with gr.Blocks(
    css="""
    .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
    #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
    """
) as iface:
    state = gr.State([])
    #caption_output = None
    #gr.Markdown(title)
    #gr.Markdown(description)
    #gr.Markdown(article)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil")
            with gr.Row():
                with gr.Column(scale=1):
                    chat_input = gr.Textbox(lines=1, label="VQA Input (question)")
                    with gr.Row():
                        clear_button = gr.Button(value="Clear", interactive=True)
                        submit_button = gr.Button(
                            value="Submit_VQA", interactive=True, variant="primary"
                        )
                    cap_submit_button = gr.Button(
                            value="Submit_CAP", interactive=True, variant="primary"
                        )
                    gpt3_submit_button = gr.Button(
                            value="Submit_GPT3", interactive=True, variant="primary"
                        )
        with gr.Column():
            caption_output = gr.Textbox(lines=1, label="VQA Output (model answer)")
            caption_output_v1 = gr.Textbox(lines=1, label="Caption Output (model caption)")
            gpt3_output_v1 = gr.Textbox(lines=1, label="GPT3 Output (GPT-3 answer)")
            
        # Clear the VQA output and chat state when a new image is uploaded.
        image_input.change(
            lambda: ("", []),
            [],
            [caption_output, state],
            queue=False,
        )
        chat_input.submit(
            inference_chat,
            [image_input, chat_input],
            [caption_output],
        )
        clear_button.click(
            lambda: ("", []),
            [],
            [chat_input, state],
            queue=False,
        )
        submit_button.click(
            inference_chat,
            [image_input, chat_input],
            [caption_output],
        )
        cap_submit_button.click(
            caption,
            [image_input],
            [caption_output_v1],
        )
        gpt3_submit_button.click(
            gpt3,
            [chat_input, caption_output, caption_output_v1],
            [gpt3_output_v1],
        )

    # examples = gr.Examples(
    #     examples=examples,
    #     inputs=[image_input, chat_input],
    # )

iface.queue(concurrency_count=1, api_open=False, max_size=10)
iface.launch()
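
# Sketch for running this demo locally; the file name app.py is an assumption,
# and OPENAI_API_KEY must hold a valid key for the GPT-3 button to work:
#   OPENAI_API_KEY=sk-... python app.py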