Niki Zhang committed "Update app.py"
app.py CHANGED
@@ -554,7 +554,21 @@ focus_map = {
     "Judge":3
 }
 
+'''
+prompt_list = [
+    'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
+]
 
+prompt_list = [
+    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
+]
+'''
 prompt_list = [
     [
         'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
@@ -720,7 +734,6 @@ def init_wo_openai_api_key():
     # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
     return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]
 
-
 def get_click_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
     if click_mode == 'Continuous':
@@ -783,10 +796,10 @@ async def chat_input_callback(*args):
         audio = await texttospeech(response,language,autoplay,gender)
         return state, state, None, audio
 
+
 
 def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None):
     print("narritive", narritive)
-
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input = image_input['background']
 
@@ -818,6 +831,11 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
         image_input.save(new_image_path)
         visual_chatgpt.current_image = new_image_path
         paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
+        # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
+        Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
+        AI_prompt = "Received."
+        visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
+        visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
         print("memory",visual_chatgpt.agent.memory)
         # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
         parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
@@ -883,7 +901,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
 
 def inference_click(image_input, point_prompt, click_mode, enable_wiki, language, sentiment, factuality,
                     length, image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
-                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, evt: gr.SelectData):
     click_index = evt.index
 
     if point_prompt == 'Positive':
@@ -915,15 +933,20 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
     # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
-    log_state= log_state + [("Selected image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
-    # deviding line
-    log_state= log_state + [("/////", None)]
-
-    if prompt["input_label"][-1]=="0":
-        state = state + [("You've added area at {}, ".format(prompt["input_point"]), None)]
-    else:
-        state = state + [("You've removed area at {}, ".format(prompt["input_point"]), None)]
 
+    # state = state + [("Selected image point: {}, Input label: {}".format(
+    #     prompt["input_point"],
+    #     '+' if prompt["input_label"] == "1" else '-'
+    # ), None)]
+
+    output_label = ['+' if label == 1 else '-' for label in prompt["input_label"]]
+
+    state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], output_label), None)]
+
+
+
+    # update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
+    text = out['generated_captions']['raw_caption']
     input_mask = np.array(out['mask'].convert('P'))
     image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)
 
@@ -934,11 +957,16 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     out_state = out
 
     if visual_chatgpt is not None:
+        print('inference_click: add caption to chatGPT memory')
         new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
         Image.open(out["crop_save_path"]).save(new_crop_save_path)
-
+        point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
+        visual_chatgpt.point_prompt = point_prompt
+
+
+        print("new crop save",new_crop_save_path)
 
-    yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
+    yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
 
 
 query_focus = {
@@ -948,10 +976,6 @@ query_focus = {
     "Judge": "Evaluate the item."
 }
 
-def generate_action(focus_type):
-    if focus_type == "D":
-        print()
-
 
 async def submit_caption(naritive, state,length, sentiment, factuality, language,
                          out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
@@ -962,17 +986,23 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
 
     click_index = click_index_state
 
+    # if pre_click_index==click_index:
+    #     click_index = (click_index[0] - 1, click_index[1] - 1)
+    #     pre_click_index = click_index
+    # else:
+    #     pre_click_index = click_index
     print("click_index",click_index)
     print("input_points_state",input_points_state)
    print("input_labels_state",input_labels_state)
 
     prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
 
-
-
     print("Prompt:", prompt)
     print("click",click_index)
 
+    # image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
+    #                                   input_points=input_points, input_labels=input_labels)
+
 
     # if not args.disable_gpt and text_refiner:
     if not args.disable_gpt:
@@ -998,6 +1028,8 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
             print("error gpt responese")
         print("item gender",gender)
 
+        # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+        #                                           input_points=input_points, input_labels=input_labels)
         try:
             if autoplay==False:
                 return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
@@ -1550,13 +1582,7 @@ def create_ui():
         css=css,
         theme=gr.themes.Base()
     ) as iface:
-        #display in the chatbox
         state = gr.State([])
-        # expoer in log
-        log_state=gr.State([])
-        # history log for gpt
-        history_log=gr.State([])
-
        out_state = gr.State(None)
         click_state = gr.State([[], [], []])
         origin_image = gr.State(None)
@@ -1582,17 +1608,38 @@ def create_ui():
         log_list=gr.State([])
         gender=gr.State('female')
         image_path=gr.State('')
+        # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
+        #     with gr.Column(scale=0.5):
+        #         # gr.Markdown("Left side content")
+
+        #     with gr.Column(scale=0.5):
+        #         with gr.Row(align="right",visible=False) as language_select:
+        #             language = gr.Dropdown(
+        #                 ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+        #                 value="English", label="Language", interactive=True)
+
+        #         with gr.Row(align="right",visible=False) as autoplay:
+        #             auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+        #             output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
 
-        with gr.Row(visible=False, elem_id="top_row") as top_row:
-            task = gr.Dropdown(
-                ["Session 1: task-based interaction","Session 2: Free-will interaction"],
-                value="Session 1: task-based interaction", label="Task", interactive=True, elem_classes="custom-language"
-            )
 
+
+
+
+
+        # with gr.Row(align="right",visible=False) as language_select:
+        #     language = gr.Dropdown(
+        #         ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+        #         value="English", label="Language", interactive=True)
+
+        # with gr.Row(align="right",visible=False) as autoplay:
+        #     auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+        #     output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
+
         with gr.Row():
             with gr.Column(scale=6):
                 with gr.Column(visible=False) as modules_not_need_gpt:
-                    with gr.Tab("Base(GPT Power)"
+                    with gr.Tab("Base(GPT Power)") as base_tab:
                         image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                         with gr.Row():
                             name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1600,7 +1647,7 @@ def create_ui():
                             year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact")
                            material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")
 
-                    with gr.Tab("Base2"
+                    with gr.Tab("Base2") as base_tab2:
                         image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                         with gr.Row():
                             name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1626,9 +1673,10 @@ def create_ui():
                             add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
                            minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
                             clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
-
-
-
+                            clear_button_image = gr.Button(value="Change", interactive=True,elem_classes="tools_button")
+                            focus_d = gr.Button(value="D",interactive=True,elem_classes="function_button",variant="primary")
+                            focus_da = gr.Button(value="DA",interactive=True,elem_classes="function_button",variant="primary")
+                            focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button",variant="primary")
                             focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")
 
                             recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
@@ -1698,7 +1746,7 @@ def create_ui():
 
                 with gr.Column(visible=False) as recommend:
                     gallery_result = gr.Gallery(
-                        label="
+                        label="Result",
                         height="auto",
                         columns=4
                         # columns=4,
@@ -1742,12 +1790,10 @@ def create_ui():
                     paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
                     cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
 
-
-
                 with gr.Column(visible=False) as modules_not_need_gpt2:
                     with gr.Blocks():
                         chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
-                    with gr.Column() as modules_need_gpt3:
+                    with gr.Column(visible=False) as modules_need_gpt3:
                         chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
                         with gr.Row():
                             clear_button_text = gr.Button(value="Clear Chat", interactive=True)
@@ -1762,6 +1808,7 @@ def create_ui():
                                 label="narritive",
                                 scale=5,
                                 interactive=True)
+
 
             # TTS interface hidden initially
             with gr.Column(visible=False) as tts_interface:
@@ -1876,13 +1923,9 @@ def create_ui():
             # )
 
             with gr.Row():
+
                 chat_log_file = gr.File(label="Download Chat Log",scale=5)
-
-            with gr.Row(elem_id="top_row") as top_row:
-                task = gr.Dropdown(
-                    ["Session 1: task-based interaction","Session 2: Free-will interaction"],
-                    value="Session 1: task-based interaction", label="Task", interactive=True, elem_classes="custom-language"
-                )
+            with gr.Row(visible=False, elem_id="top_row") as top_row:
                 language = gr.Dropdown(
                     ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
                     value="English", label="Language", interactive=True, elem_classes="custom-language"
@@ -2050,7 +2093,7 @@ def create_ui():
 
         # mv_images = gr.State()
 
-        chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])
+        # chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])
 
         # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
        #     fn=generate_mvs,
@@ -2107,6 +2150,13 @@ def create_ui():
                                     outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                              modules_not_need_gpt,
                                              modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend])
+        # openai_api_key.submit(init_openai_api_key,
+        #                       outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+        #                                modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
+        # enable_chatGPT_button.click(init_openai_api_key,
+        #                             outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
+        #                                      modules_not_need_gpt,
+        #                                      modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
 
         disable_chatGPT_button.click(init_wo_openai_api_key,
                                      outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
@@ -2146,23 +2196,23 @@ def create_ui():
         )
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        enable_chatGPT_button.click(
+            lambda: (None, [], [], [[], [], []], "", "", ""),
+            [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+            queue=False,
+            show_progress=False
+        )
+        openai_api_key.submit(
+            lambda: (None, [], [], [[], [], []], "", "", ""),
+            [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+            queue=False,
+            show_progress=False
+        )
 
-
-
+        cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
+                                    [paragraph_output,output_audio])
 
         clear_button_click.click(
             lambda x: ([[], [], []], x),
@@ -2172,14 +2222,14 @@ def create_ui():
             show_progress=False
         )
         clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
-
-
-
-
-
-
-
-
+        clear_button_image.click(
+            lambda: (None, [], [], [[], [], []], "", "", ""),
+            [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+            queue=False,
+            show_progress=False
+        )
+        clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
         clear_button_text.click(
             lambda: ([], [], [[], [], [], []]),
             [],
@@ -2192,7 +2242,7 @@ def create_ui():
         image_input.clear(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
-            [image_input, chatbot, state, click_state,
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
             queue=False,
             show_progress=False
         )
@@ -2202,17 +2252,17 @@ def create_ui():
 
 
 
-
-
-
-
-
+        image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
+                                [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+                                 image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+                                 name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+                                 paragraph,artist,gender,image_path])
 
-
-
-
-
-
+        image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
+                                  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+                                   image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+                                   name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+                                   paragraph,artist,gender,image_path])
 
         image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key,language,naritive],
                            [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2267,10 +2317,10 @@ def create_ui():
             return [gr.update(visible=False)]*4
 
 
-
-
-
-
+        traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
+        click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
+        base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
+        base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
 
 
 
@@ -2280,9 +2330,9 @@ def create_ui():
             inputs=[
                 origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
                 image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
-                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
             ],
-            outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
+            outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
             show_progress=False, queue=True
         )
 
@@ -2389,7 +2439,7 @@ def create_ui():
         naritive.change(
            lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
-            [image_input, chatbot, state, click_state,
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
             queue=False,
             show_progress=False
         )