Niki Zhang committed
Commit 5e61373 · verified · 1 Parent(s): 06d9dec

Update app.py

Files changed (1): app.py +138 -88
app.py CHANGED
@@ -554,7 +554,21 @@ focus_map = {
"Judge":3
}

+ '''
+ prompt_list = [
+ 'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
+ ]

+ prompt_list = [
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
+ ]
+ '''
prompt_list = [
[
'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
@@ -720,7 +734,6 @@ def init_wo_openai_api_key():
# return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]

-
def get_click_prompt(chat_input, click_state, click_mode):
inputs = json.loads(chat_input)
if click_mode == 'Continuous':
@@ -783,10 +796,10 @@ async def chat_input_callback(*args):
audio = await texttospeech(response,language,autoplay,gender)
return state, state, None, audio

+

def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None):
print("narritive", narritive)
-
if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
image_input = image_input['background']

@@ -818,6 +831,11 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
image_input.save(new_image_path)
visual_chatgpt.current_image = new_image_path
paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
+ # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
+ Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
+ AI_prompt = "Received."
+ visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
+ visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
print("memory",visual_chatgpt.agent.memory)
# visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
@@ -883,7 +901,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None

def inference_click(image_input, point_prompt, click_mode, enable_wiki, language, sentiment, factuality,
length, image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
- out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, log_state, evt: gr.SelectData):
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, evt: gr.SelectData):
click_index = evt.index

if point_prompt == 'Positive':
@@ -915,15 +933,20 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
# state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
- log_state= log_state + [("Selected image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
- # deviding line
- log_state= log_state + [("/////", None)]
-
- if prompt["input_label"][-1]=="0":
- state = state + [("You've added area at {}, ".format(prompt["input_point"]), None)]
- else:
- state = state + [("You've removed area at {}, ".format(prompt["input_point"]), None)]

+ # state = state + [("Selected image point: {}, Input label: {}".format(
+ # prompt["input_point"],
+ # '+' if prompt["input_label"] == "1" else '-'
+ # ), None)]
+
+ output_label = ['+' if label == 1 else '-' for label in prompt["input_label"]]
+
+ state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], output_label), None)]
+
+
+
+ # update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
+ text = out['generated_captions']['raw_caption']
input_mask = np.array(out['mask'].convert('P'))
image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)

@@ -934,11 +957,16 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
out_state = out

if visual_chatgpt is not None:
+ print('inference_click: add caption to chatGPT memory')
new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
Image.open(out["crop_save_path"]).save(new_crop_save_path)
- print("new crop save",new_crop_save_path)
+ point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
+ visual_chatgpt.point_prompt = point_prompt
+
+
+ print("new crop save",new_crop_save_path)

- yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground,log_state
+ yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground


query_focus = {
@@ -948,10 +976,6 @@ query_focus = {
"Judge": "Evaluate the item."
}

- def generate_action(focus_type):
- if focus_type == "D":
- print()
-

async def submit_caption(naritive, state,length, sentiment, factuality, language,
out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
@@ -962,17 +986,23 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language

click_index = click_index_state

+ # if pre_click_index==click_index:
+ # click_index = (click_index[0] - 1, click_index[1] - 1)
+ # pre_click_index = click_index
+ # else:
+ # pre_click_index = click_index
print("click_index",click_index)
print("input_points_state",input_points_state)
print("input_labels_state",input_labels_state)

prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)

-
-
print("Prompt:", prompt)
print("click",click_index)

+ # image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
+ # input_points=input_points, input_labels=input_labels)
+

# if not args.disable_gpt and text_refiner:
if not args.disable_gpt:
@@ -998,6 +1028,8 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
print("error gpt responese")
print("item gender",gender)

+ # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+ # input_points=input_points, input_labels=input_labels)
try:
if autoplay==False:
return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
@@ -1550,13 +1582,7 @@ def create_ui():
css=css,
theme=gr.themes.Base()
) as iface:
- #display in the chatbox
state = gr.State([])
- # expoer in log
- log_state=gr.State([])
- # history log for gpt
- history_log=gr.State([])
-
out_state = gr.State(None)
click_state = gr.State([[], [], []])
origin_image = gr.State(None)
@@ -1582,17 +1608,38 @@ def create_ui():
log_list=gr.State([])
gender=gr.State('female')
image_path=gr.State('')
+ # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
+ # with gr.Column(scale=0.5):
+ # # gr.Markdown("Left side content")
+
+ # with gr.Column(scale=0.5):
+ # with gr.Row(align="right",visible=False) as language_select:
+ # language = gr.Dropdown(
+ # ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+ # value="English", label="Language", interactive=True)
+
+ # with gr.Row(align="right",visible=False) as autoplay:
+ # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+ # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)

- with gr.Row(visible=False, elem_id="top_row") as top_row:
- task = gr.Dropdown(
- ["Session 1: task-based interaction","Session 2: Free-will interaction"],
- value="Session 1: task-based interaction", label="Task", interactive=True, elem_classes="custom-language"
- )

+
+
+
+
+ # with gr.Row(align="right",visible=False) as language_select:
+ # language = gr.Dropdown(
+ # ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+ # value="English", label="Language", interactive=True)
+
+ # with gr.Row(align="right",visible=False) as autoplay:
+ # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+ # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
+
with gr.Row():
with gr.Column(scale=6):
with gr.Column(visible=False) as modules_not_need_gpt:
- with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
+ with gr.Tab("Base(GPT Power)") as base_tab:
image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
with gr.Row():
name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1600,7 +1647,7 @@ def create_ui():
year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact")
material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")

- with gr.Tab("Base2",visible=False) as base_tab2:
+ with gr.Tab("Base2") as base_tab2:
image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
with gr.Row():
name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1626,9 +1673,10 @@ def create_ui():
add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
- focus_d = gr.Button(value="Describe",interactive=True,elem_classes="function_button",variant="primary")
- focus_da = gr.Button(value="D+Analysis",interactive=True,elem_classes="function_button",variant="primary")
- focus_dai = gr.Button(value="DA+Interprete",interactive=True,elem_classes="function_button",variant="primary")
+ clear_button_image = gr.Button(value="Change", interactive=True,elem_classes="tools_button")
+ focus_d = gr.Button(value="D",interactive=True,elem_classes="function_button",variant="primary")
+ focus_da = gr.Button(value="DA",interactive=True,elem_classes="function_button",variant="primary")
+ focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button",variant="primary")
focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")

recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
@@ -1698,7 +1746,7 @@ def create_ui():

with gr.Column(visible=False) as recommend:
gallery_result = gr.Gallery(
- label="Recommendations",
+ label="Result",
height="auto",
columns=4
# columns=4,
@@ -1742,12 +1790,10 @@ def create_ui():
paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)

-
-
with gr.Column(visible=False) as modules_not_need_gpt2:
with gr.Blocks():
chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
- with gr.Column() as modules_need_gpt3:
+ with gr.Column(visible=False) as modules_need_gpt3:
chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
with gr.Row():
clear_button_text = gr.Button(value="Clear Chat", interactive=True)
@@ -1762,6 +1808,7 @@ def create_ui():
label="narritive",
scale=5,
interactive=True)
+

# TTS interface hidden initially
with gr.Column(visible=False) as tts_interface:
@@ -1876,13 +1923,9 @@ def create_ui():
# )

with gr.Row():
+
chat_log_file = gr.File(label="Download Chat Log",scale=5)
-
- with gr.Row(elem_id="top_row") as top_row:
- task = gr.Dropdown(
- ["Session 1: task-based interaction","Session 2: Free-will interaction"],
- value="Session 1: task-based interaction", label="Task", interactive=True, elem_classes="custom-language"
- )
+ with gr.Row(visible=False, elem_id="top_row") as top_row:
language = gr.Dropdown(
['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
value="English", label="Language", interactive=True, elem_classes="custom-language"
@@ -2050,7 +2093,7 @@ def create_ui():

# mv_images = gr.State()

- chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])
+ # chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])

# submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
# fn=generate_mvs,
@@ -2107,6 +2150,13 @@ def create_ui():
outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
modules_not_need_gpt,
modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend])
+ # openai_api_key.submit(init_openai_api_key,
+ # outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+ # modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
+ # enable_chatGPT_button.click(init_openai_api_key,
+ # outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
+ # modules_not_need_gpt,
+ # modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])

disable_chatGPT_button.click(init_wo_openai_api_key,
outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
@@ -2146,23 +2196,23 @@ def create_ui():
)


- # enable_chatGPT_button.click(
- # lambda: (None, [], [], [[], [], []], "", "", ""),
- # [],
- # [image_input, chatbot, state, click_state, paragraph_output, origin_image],
- # queue=False,
- # show_progress=False
- # )
- # openai_api_key.submit(
- # lambda: (None, [], [], [[], [], []], "", "", ""),
- # [],
- # [image_input, chatbot, state, click_state, paragraph_output, origin_image],
- # queue=False,
- # show_progress=False
- # )
+ enable_chatGPT_button.click(
+ lambda: (None, [], [], [[], [], []], "", "", ""),
+ [],
+ [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+ queue=False,
+ show_progress=False
+ )
+ openai_api_key.submit(
+ lambda: (None, [], [], [[], [], []], "", "", ""),
+ [],
+ [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+ queue=False,
+ show_progress=False
+ )

- # cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
- # [paragraph_output,output_audio])
+ cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
+ [paragraph_output,output_audio])

clear_button_click.click(
lambda x: ([[], [], []], x),
@@ -2172,14 +2222,14 @@ def create_ui():
show_progress=False
)
clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
- # clear_button_image.click(
- # lambda: (None, [], [], [[], [], []], "", "", ""),
- # [],
- # [image_input, chatbot, state, click_state, paragraph, origin_image],
- # queue=False,
- # show_progress=False
- # )
- # clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
+ clear_button_image.click(
+ lambda: (None, [], [], [[], [], []], "", "", ""),
+ [],
+ [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+ queue=False,
+ show_progress=False
+ )
+ clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
clear_button_text.click(
lambda: ([], [], [[], [], [], []]),
[],
@@ -2192,7 +2242,7 @@ def create_ui():
image_input.clear(
lambda: (None, [], [], [[], [], []], "", "", ""),
[],
- [image_input, chatbot, state, click_state, paragraph, origin_image],
+ [image_input, chatbot, state, click_state, paragraph_output, origin_image],
queue=False,
show_progress=False
)
@@ -2202,17 +2252,17 @@ def create_ui():



- # image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
- # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
- # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
- # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
- # paragraph,artist,gender,image_path])
+ image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
+ [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+ image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+ name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+ paragraph,artist,gender,image_path])

- # image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
- # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
- # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
- # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
- # paragraph,artist,gender,image_path])
+ image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
+ [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+ image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+ name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+ paragraph,artist,gender,image_path])

image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key,language,naritive],
[chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2267,10 +2317,10 @@ def create_ui():
return [gr.update(visible=False)]*4


- # traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
- # click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
- # base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
- # base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
+ traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
+ click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
+ base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
+ base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])




@@ -2280,9 +2330,9 @@ def create_ui():
inputs=[
origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
- out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, log_state
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
],
- outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground,log_state],
+ outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
show_progress=False, queue=True
)

@@ -2389,7 +2439,7 @@ def create_ui():
naritive.change(
lambda: (None, [], [], [[], [], []], "", "", ""),
[],
- [image_input, chatbot, state, click_state, paragraph, origin_image],
+ [image_input, chatbot, state, click_state, paragraph_output, origin_image],
queue=False,
show_progress=False