Niki Zhang committed "Update app.py"
app.py CHANGED
@@ -554,7 +554,21 @@ focus_map = {
     "Judge":3
 }
 
+'''
+prompt_list = [
+    'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
+]
 
+prompt_list = [
+    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
+]
+'''
 prompt_list = [
     [
         'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
@@ -720,7 +734,6 @@ def init_wo_openai_api_key():
     # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
     return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]
 
-
 def get_click_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
     if click_mode == 'Continuous':
@@ -783,10 +796,10 @@ async def chat_input_callback(*args):
         audio = await texttospeech(response,language,autoplay,gender)
         return state, state, None, audio
 
+
 
 def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None):
     print("narritive", narritive)
-
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input = image_input['background']
 
@@ -818,6 +831,11 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
         image_input.save(new_image_path)
         visual_chatgpt.current_image = new_image_path
         paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
+        # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
+        Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
+        AI_prompt = "Received."
+        visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
+        visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
         print("memory",visual_chatgpt.agent.memory)
         # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
         parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
@@ -883,7 +901,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
 
 def inference_click(image_input, point_prompt, click_mode, enable_wiki, language, sentiment, factuality,
                     length, image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
-                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, evt: gr.SelectData):
     click_index = evt.index
 
     if point_prompt == 'Positive':
@@ -915,15 +933,20 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
     # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
-    log_state= log_state + [("Selected image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
-    # deviding line
-    log_state= log_state + [("/////", None)]
-
-    if prompt["input_label"][-1]=="0":
-        state = state + [("You've added area at {}, ".format(prompt["input_point"]), None)]
-    else:
-        state = state + [("You've removed area at {}, ".format(prompt["input_point"]), None)]
 
+    # state = state + [("Selected image point: {}, Input label: {}".format(
+    #     prompt["input_point"],
+    #     '+' if prompt["input_label"] == "1" else '-'
+    # ), None)]
+
+    output_label = ['+' if label == 1 else '-' for label in prompt["input_label"]]
+
+    state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], output_label), None)]
+
+
+
+    # update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
+    text = out['generated_captions']['raw_caption']
     input_mask = np.array(out['mask'].convert('P'))
     image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)
 
@@ -934,11 +957,16 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     out_state = out
 
     if visual_chatgpt is not None:
+        print('inference_click: add caption to chatGPT memory')
         new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
         Image.open(out["crop_save_path"]).save(new_crop_save_path)
-
+        point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
+        visual_chatgpt.point_prompt = point_prompt
+
+
+        print("new crop save",new_crop_save_path)
 
-    yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
+    yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
 
 
 query_focus = {
@@ -948,10 +976,6 @@ query_focus = {
     "Judge": "Evaluate the item."
 }
 
-def generate_action(focus_type):
-    if focus_type == "D":
-        print()
-
 
 async def submit_caption(naritive, state,length, sentiment, factuality, language,
                          out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
@@ -962,17 +986,23 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
 
     click_index = click_index_state
 
+    # if pre_click_index==click_index:
+    #     click_index = (click_index[0] - 1, click_index[1] - 1)
+    #     pre_click_index = click_index
+    # else:
+    #     pre_click_index = click_index
     print("click_index",click_index)
     print("input_points_state",input_points_state)
    print("input_labels_state",input_labels_state)
 
     prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
 
-
-
     print("Prompt:", prompt)
     print("click",click_index)
 
+    # image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
+    #                                   input_points=input_points, input_labels=input_labels)
+
 
     # if not args.disable_gpt and text_refiner:
     if not args.disable_gpt:
@@ -998,6 +1028,8 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
             print("error gpt responese")
         print("item gender",gender)
 
+        # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+        #                                           input_points=input_points, input_labels=input_labels)
         try:
             if autoplay==False:
                 return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
@@ -1550,13 +1582,7 @@ def create_ui():
         css=css,
         theme=gr.themes.Base()
     ) as iface:
-        #display in the chatbox
         state = gr.State([])
-        # expoer in log
-        log_state=gr.State([])
-        # history log for gpt
-        history_log=gr.State([])
-
        out_state = gr.State(None)
         click_state = gr.State([[], [], []])
         origin_image = gr.State(None)
@@ -1582,17 +1608,38 @@ def create_ui():
         log_list=gr.State([])
         gender=gr.State('female')
         image_path=gr.State('')
+        # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
+        #     with gr.Column(scale=0.5):
+        #         # gr.Markdown("Left side content")
+
+        #     with gr.Column(scale=0.5):
+        #         with gr.Row(align="right",visible=False) as language_select:
+        #             language = gr.Dropdown(
+        #                 ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+        #                 value="English", label="Language", interactive=True)
+
+        #         with gr.Row(align="right",visible=False) as autoplay:
+        #             auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+        #             output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
 
-        with gr.Row(visible=False, elem_id="top_row") as top_row:
-            task = gr.Dropdown(
-                ["Session 1: task-based interaction","Session 2: Free-will interaction"],
-                value="Session 1: task-based interaction", label="Task", interactive=True, elem_classes="custom-language"
-            )
 
+
+
+
+
+        # with gr.Row(align="right",visible=False) as language_select:
+        #     language = gr.Dropdown(
+        #         ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+        #         value="English", label="Language", interactive=True)
+
+        # with gr.Row(align="right",visible=False) as autoplay:
+        #     auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+        #     output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
+
         with gr.Row():
             with gr.Column(scale=6):
                 with gr.Column(visible=False) as modules_not_need_gpt:
-                    with gr.Tab("Base(GPT Power)"
+                    with gr.Tab("Base(GPT Power)") as base_tab:
                         image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                         with gr.Row():
                             name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1600,7 +1647,7 @@ def create_ui():
                             year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact")
                            material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")
 
-                    with gr.Tab("Base2"
+                    with gr.Tab("Base2") as base_tab2:
                         image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                         with gr.Row():
                             name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1626,9 +1673,10 @@ def create_ui():
                             add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
                            minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
                             clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
-
-
-
+                            clear_button_image = gr.Button(value="Change", interactive=True,elem_classes="tools_button")
+                            focus_d = gr.Button(value="D",interactive=True,elem_classes="function_button",variant="primary")
+                            focus_da = gr.Button(value="DA",interactive=True,elem_classes="function_button",variant="primary")
+                            focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button",variant="primary")
                             focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")
 
                             recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
@@ -1698,7 +1746,7 @@ def create_ui():
 
                 with gr.Column(visible=False) as recommend:
                     gallery_result = gr.Gallery(
-                        label="
+                        label="Result",
                         height="auto",
                         columns=4
                         # columns=4,
@@ -1742,12 +1790,10 @@ def create_ui():
                     paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
                     cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
 
-
-
                 with gr.Column(visible=False) as modules_not_need_gpt2:
                     with gr.Blocks():
                         chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
-                    with gr.Column() as modules_need_gpt3:
+                    with gr.Column(visible=False) as modules_need_gpt3:
                         chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
                         with gr.Row():
                             clear_button_text = gr.Button(value="Clear Chat", interactive=True)
@@ -1762,6 +1808,7 @@ def create_ui():
                                 label="narritive",
                                 scale=5,
                                 interactive=True)
+
 
             # TTS interface hidden initially
             with gr.Column(visible=False) as tts_interface:
@@ -1876,13 +1923,9 @@ def create_ui():
             # )
 
             with gr.Row():
+
                 chat_log_file = gr.File(label="Download Chat Log",scale=5)
-
-            with gr.Row(elem_id="top_row") as top_row:
-                task = gr.Dropdown(
-                    ["Session 1: task-based interaction","Session 2: Free-will interaction"],
-                    value="Session 1: task-based interaction", label="Task", interactive=True, elem_classes="custom-language"
-                )
+            with gr.Row(visible=False, elem_id="top_row") as top_row:
                 language = gr.Dropdown(
                     ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
                     value="English", label="Language", interactive=True, elem_classes="custom-language"
@@ -2050,7 +2093,7 @@ def create_ui():
 
         # mv_images = gr.State()
 
-        chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])
+        # chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])
 
         # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
        #     fn=generate_mvs,
@@ -2107,6 +2150,13 @@ def create_ui():
                                     outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                              modules_not_need_gpt,
                                              modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend])
+        # openai_api_key.submit(init_openai_api_key,
+        #                       outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+        #                                modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
+        # enable_chatGPT_button.click(init_openai_api_key,
+        #                             outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
+        #                                      modules_not_need_gpt,
+        #                                      modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
 
         disable_chatGPT_button.click(init_wo_openai_api_key,
                                      outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
@@ -2146,23 +2196,23 @@ def create_ui():
         )
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        enable_chatGPT_button.click(
+            lambda: (None, [], [], [[], [], []], "", "", ""),
+            [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+            queue=False,
+            show_progress=False
+        )
+        openai_api_key.submit(
+            lambda: (None, [], [], [[], [], []], "", "", ""),
+            [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+            queue=False,
+            show_progress=False
+        )
 
-
-
+        cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
+                                    [paragraph_output,output_audio])
 
         clear_button_click.click(
             lambda x: ([[], [], []], x),
@@ -2172,14 +2222,14 @@ def create_ui():
             show_progress=False
         )
         clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
-
-
-
-
-
-
-
-
+        clear_button_image.click(
+            lambda: (None, [], [], [[], [], []], "", "", ""),
+            [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+            queue=False,
+            show_progress=False
+        )
+        clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
         clear_button_text.click(
             lambda: ([], [], [[], [], [], []]),
             [],
@@ -2192,7 +2242,7 @@ def create_ui():
         image_input.clear(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
-            [image_input, chatbot, state, click_state,
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
             queue=False,
             show_progress=False
         )
@@ -2202,17 +2252,17 @@ def create_ui():
 
 
 
-
-
-
-
-
+        image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
+                                [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+                                 image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+                                 name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+                                 paragraph,artist,gender,image_path])
 
-
-
-
-
-
+        image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
+                                  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+                                   image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+                                   name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+                                   paragraph,artist,gender,image_path])
 
         image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key,language,naritive],
                            [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2267,10 +2317,10 @@ def create_ui():
             return [gr.update(visible=False)]*4
 
 
-
-
-
-
+        traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
+        click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
+        base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
+        base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
 
 
 
@@ -2280,9 +2330,9 @@ def create_ui():
             inputs=[
                 origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
                 image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
-                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
             ],
-            outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
+            outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
             show_progress=False, queue=True
         )
 
@@ -2389,7 +2439,7 @@ def create_ui():
         naritive.change(
            lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
-            [image_input, chatbot, state, click_state,
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
             queue=False,
             show_progress=False
         )