Niki Zhang committed on

Commit 9ef960d · verified · 1 Parent(s): 10d7824

Update app.py

Files changed (1):
  1. app.py (+565, -445)

app.py CHANGED
@@ -1,3 +1,4 @@
  from io import BytesIO
  import io
  from math import inf
@@ -346,20 +347,33 @@ def extract_features_siglip(image):
      return image_features

  @spaces.GPU
- def infer(image_path):
-     input_image = Image.open(image_path).convert("RGB")
-     input_features = extract_features_siglip(input_image.convert("RGB"))
-     input_features = input_features.detach().cpu().numpy()
-     input_features = np.float32(input_features)
-     faiss.normalize_L2(input_features)
-     distances, indices = index.search(input_features, 3)
-     gallery_output = []
-     for i,v in enumerate(indices[0]):
-         sim = -distances[0][i]
-         image_url = df.iloc[v]["Link"]
-         img_retrieved = read_image_from_url(image_url)
-         gallery_output.append(img_retrieved)
-     return gallery_output

  ###############################################################################
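The removed single-path `infer` (and its replacement later in this diff) lean on retrieval objects that are defined elsewhere in app.py and never appear in this hunk: a FAISS index over SigLIP image features (`index`), a metadata DataFrame with a "Link" column (`df`), and a `read_image_from_url` helper. A minimal sketch of that assumed setup, for orientation only:

```python
# Illustrative sketch only -- the real app builds these objects elsewhere in app.py.
from io import BytesIO

import faiss
import numpy as np
import pandas as pd
import requests
from PIL import Image

# Assume `embeddings` is an (N, D) float32 matrix of SigLIP image features.
embeddings = np.float32(np.random.rand(16, 768))
faiss.normalize_L2(embeddings)                   # normalise so inner product behaves like cosine similarity
index = faiss.IndexFlatIP(embeddings.shape[1])   # the index that infer() queries
index.add(embeddings)

# One metadata row per indexed vector; infer() looks up the artwork URL by row position.
df = pd.DataFrame({"Link": [f"https://example.com/artwork_{i}.jpg" for i in range(16)]})

def read_image_from_url(url):
    # Assumed helper: download an image and return it as a PIL image.
    response = requests.get(url, timeout=10)
    return Image.open(BytesIO(response.content)).convert("RGB")
```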
@@ -547,28 +561,14 @@ filtered_language_dict = {
547
  'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'}
548
  }
549
 
550
- focus_map = {
551
- "D":0,
552
- "DA":1,
553
- "DAI":2,
554
  "Judge":3
555
  }
556
 
557
- '''
558
- prompt_list = [
559
- 'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
560
- 'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
561
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
562
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
563
- ]
564
 
565
- prompt_list = [
566
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
567
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
568
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
569
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
570
- ]
571
- '''
572
  prompt_list = [
573
  [
574
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
@@ -722,11 +722,11 @@ def init_openai_api_key(api_key=""):
722
  global gpt_state
723
  gpt_state=1
724
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
725
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
726
  else:
727
  gpt_state=0
728
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
729
- return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2
730
 
731
  def init_wo_openai_api_key():
732
  global gpt_state
@@ -734,6 +734,7 @@ def init_wo_openai_api_key():
734
  # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
735
  return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]
736
 
 
737
  def get_click_prompt(chat_input, click_state, click_mode):
738
  inputs = json.loads(chat_input)
739
  if click_mode == 'Continuous':
@@ -771,35 +772,38 @@ def update_click_state(click_state, caption, click_mode):
771
  raise NotImplementedError
772
 
773
  async def chat_input_callback(*args):
774
- visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input = args
775
  message = chat_input["text"]
776
  prompt="Please help me answer the question with this painting {question} in {language}."
777
  prompt=prompt.format(question=message, language=language)
778
- state = state + [(message,None)]
779
  if visual_chatgpt is not None:
780
- result=get_gpt_response(api_key, image_input,prompt+message)
781
- state = state + [(None, result)]
782
  read_info = re.sub(r'[#[\]!*]','',result)
783
  read_info = emoji.replace_emoji(read_info,replace="")
784
- # state, _, aux_state, _ = visual_chatgpt.run_text(message, state, aux_state)
785
- last_text, last_response = state[-1]
786
- print("last response",last_response)
 
 
 
 
787
  if autoplay==False:
788
- return state, state, aux_state, None
789
 
790
  else:
791
- audio = await texttospeech(read_info,language,autoplay,gender)
792
- return state, state, aux_state, audio
793
  else:
794
  response = "Text refiner is not initilzed, please input openai api key."
795
  state = state + [(chat_input, response)]
796
- audio = await texttospeech(response,language,autoplay,gender)
797
- return state, state, None, audio
798
-
799
 
800
 
801
- def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None):
802
  print("narritive", narritive)
 
803
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
804
  image_input = image_input['background']
805
 
@@ -810,6 +814,21 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
810
 
811
 
812
  click_state = [[], [], []]
813
  image_input = image_resize(image_input, res=1024)
814
 
815
  model = build_caption_anything_with_models(
@@ -831,18 +850,14 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
831
  image_input.save(new_image_path)
832
  visual_chatgpt.current_image = new_image_path
833
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
834
- # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
835
- Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
836
- AI_prompt = "Received."
837
- visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
838
- visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
839
- print("memory",visual_chatgpt.agent.memory)
840
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
841
  parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
 
842
  parsed_data = json.loads(parsed_data.replace("'", "\""))
843
  name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
844
  gender=gender.lower()
845
  print("gender",gender)
 
846
 
847
 
848
  if language=="English":
@@ -888,13 +903,21 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
888
  None,
889
  f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
890
  )
891
- ]
 
892
 
893
-
894
-
895
 
896
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
897
- original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path]
898
 
899
 
900
 
@@ -933,20 +956,23 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
933
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
934
  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
935
  # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
936
-
937
- # state = state + [("Selected image point: {}, Input label: {}".format(
938
- # prompt["input_point"],
939
- # '+' if prompt["input_label"] == "1" else '-'
940
- # ), None)]
941
 
942
- output_label = ['+' if label == 1 else '-' for label in prompt["input_label"]]
943
 
944
- state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], output_label), None)]
945
-
946
-
 
 
 
 
 
 
 
 
947
 
948
- # update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
949
- text = out['generated_captions']['raw_caption']
950
  input_mask = np.array(out['mask'].convert('P'))
951
  image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)
952
 
@@ -957,62 +983,75 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
957
  out_state = out
958
 
959
  if visual_chatgpt is not None:
960
- print('inference_click: add caption to chatGPT memory')
961
  new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
962
  Image.open(out["crop_save_path"]).save(new_crop_save_path)
963
- point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
964
- visual_chatgpt.point_prompt = point_prompt
965
-
966
-
967
- print("new crop save",new_crop_save_path)
968
 
969
  yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
970
 
971
 
972
- query_focus = {
973
- "D": "Provide a description of the item.",
974
- "DA": "Provide a description and analysis of the item.",
975
- "DAI": "Provide a description, analysis, and interpretation of the item.",
976
- "Judge": "Evaluate the item."
977
- }
 
978
 
979
 
980
  async def submit_caption(naritive, state,length, sentiment, factuality, language,
981
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
982
- autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path, gender):
983
 
984
 
985
- state = state + [(query_focus[focus_type], None)]
986
-
987
  click_index = click_index_state
988
 
989
- # if pre_click_index==click_index:
990
- # click_index = (click_index[0] - 1, click_index[1] - 1)
991
- # pre_click_index = click_index
992
- # else:
993
- # pre_click_index = click_index
994
  print("click_index",click_index)
995
  print("input_points_state",input_points_state)
996
  print("input_labels_state",input_labels_state)
997
 
998
  prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
999
 
 
 
 
1000
  print("Prompt:", prompt)
1001
  print("click",click_index)
1002
 
1003
- # image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
1004
- # input_points=input_points, input_labels=input_labels)
1005
-
1006
 
1007
  # if not args.disable_gpt and text_refiner:
1008
  if not args.disable_gpt:
1009
  print("new crop save",new_crop_save_path)
1010
- focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt)
1011
  if focus_info.startswith('"') and focus_info.endswith('"'):
1012
  focus_info=focus_info[1:-1]
1013
  focus_info=focus_info.replace('#', '')
1014
  # state = state + [(None, f"Wiki: {paragraph}")]
1015
- state = state + [(None, f"{focus_info}")]
 
1016
  print("new_cap",focus_info)
1017
  read_info = re.sub(r'[#[\]!*]','',focus_info)
1018
  read_info = emoji.replace_emoji(read_info,replace="")
@@ -1028,27 +1067,25 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
1028
  print("error gpt responese")
1029
  print("item gender",gender)
1030
 
1031
- # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1032
- # input_points=input_points, input_labels=input_labels)
1033
  try:
1034
  if autoplay==False:
1035
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
1036
 
1037
- audio_output = await texttospeech(read_info, language, autoplay,gender)
1038
  print("done")
1039
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1040
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
1041
 
1042
  except Exception as e:
1043
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1044
  print(f"Error during TTS prediction: {str(e)}")
1045
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
1046
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
1047
 
1048
  else:
1049
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1050
  print(f"Error during TTS prediction: {str(e)}")
1051
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None
1052
 
1053
 
1054
 
@@ -1088,70 +1125,57 @@ def encode_image(image_path):
1088
  with open(image_path, "rb") as image_file:
1089
  return base64.b64encode(image_file.read()).decode('utf-8')
1090
 
1091
- def get_gpt_response(api_key, image_path, prompt, enable_wiki=None):
1092
- headers = {
1093
- "Content-Type": "application/json",
1094
- "Authorization": f"Bearer {api_key}"
1095
- }
1096
 
1097
  headers = {
1098
  "Content-Type": "application/json",
1099
  "Authorization": f"Bearer {api_key}"
1100
  }
1101
- base64_images=[]
1102
 
1103
  if image_path:
1104
- if isinstance(image_path, list):
1105
-
1106
- for img in image_path:
1107
- base64_image = encode_image(img)
1108
- base64_images.append(base64_image)
1109
- else:
1110
- base64_image = encode_image(image_path)
1111
- base64_images.append(base64_image)
1112
-
1113
- payload = {
1114
- "model": "gpt-4o",
1115
- "messages": [
1116
  {
1117
- "role": "user",
1118
- "content": [
1119
- {
1120
- "type": "text",
1121
- "text": prompt
1122
- },
1123
- {
1124
- "type": "image_url",
1125
- "image_url": {
1126
- "url": f"data:image/jpeg;base64,{base64_images}"
1127
- }
1128
- }
1129
- ]
1130
- }
1131
- ],
1132
- "max_tokens": 300
1133
- }
1134
- else:
1135
- payload = {
1136
- "model": "gpt-4o",
1137
- "messages": [
1138
- {
1139
- "role": "user",
1140
- "content": [
1141
- {
1142
- "type": "text",
1143
- "text": prompt
1144
  }
1145
- ]
1146
- }
1147
- ],
1148
- "max_tokens": 300
1149
- }
 
 
 
1150
 
1151
  # Sending the request to the OpenAI API
1152
  response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
1153
  result = response.json()
1154
-
1155
  try:
1156
  content = result['choices'][0]['message']['content']
1157
  return content
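Note that the removed payload interpolates the entire `base64_images` list into a single data URL (`f"data:image/jpeg;base64,{base64_images}"`), which cannot decode as an image once more than one picture is passed. A hedged sketch of the presumably intended shape, with one `image_url` part per encoded image (this is not the committed code):

```python
# Sketch, not the committed code: one image_url part per base64-encoded image.
content = [{"type": "text", "text": prompt}]
for b64 in base64_images:
    content.append({
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
    })

payload = {
    "model": "gpt-4o",
    "messages": [{"role": "user", "content": content}],
    "max_tokens": 300,
}
```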
@@ -1295,61 +1319,62 @@ def clear_chat_memory(visual_chatgpt, keep_global=False):
1295
  visual_chatgpt.global_prompt = ""
1296
 
1297
 
1298
- def export_chat_log(chat_state, paragraph, liked, disliked,log_list):
1299
  try:
 
1300
  if not chat_state:
1301
  return None
1302
- chat_log = f"Image Description: {paragraph}\n\n"
1303
  for entry in chat_state:
1304
  user_message, bot_response = entry
1305
  if user_message and bot_response:
1306
  chat_log += f"User: {user_message}\nBot: {bot_response}\n"
 
 
1307
  elif user_message:
1308
  chat_log += f"User: {user_message}\n"
 
1309
  elif bot_response:
1310
  chat_log += f"Bot: {bot_response}\n"
 
1311
 
1312
- # 添加 liked 和 disliked 信息
1313
- chat_log += "\nLiked Responses:\n"
1314
- for response in liked:
1315
- chat_log += f"{response}\n"
1316
 
1317
- chat_log += "\nDisliked Responses:\n"
1318
- for response in disliked:
1319
- chat_log += f"{response}\n"
1320
 
1321
- print("export log...")
1322
- print("chat_log", chat_log)
1323
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
1324
- temp_file.write(chat_log.encode('utf-8'))
1325
- temp_file_path = temp_file.name
1326
- print(temp_file_path)
1327
- log_list.append(temp_file_path)
1328
  return log_list,log_list
1329
  except Exception as e:
1330
  print(f"An error occurred while exporting the chat log: {e}")
1331
- return None
1332
 
1333
- async def get_artistinfo(artist_name,api_key,state,language,autoplay,length):
1334
  prompt = f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received. Start your response with 'Artist Background: '."
1335
  res=get_gpt_response(api_key,None,prompt)
1336
  state = state + [(None, res)]
1337
  read_info = re.sub(r'[#[\]!*]','',res)
1338
  read_info = emoji.replace_emoji(read_info,replace="")
 
1339
 
1340
 
1341
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1342
  # input_points=input_points, input_labels=input_labels)
1343
  if autoplay:
1344
- audio_output = await texttospeech(read_info, language,autoplay)
1345
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1346
- return state, state,audio_output
1347
- return state, state,None
1348
 
1349
 
1350
- async def get_yearinfo(year,api_key,state,language,autoplay,length):
1351
  prompt = f"Provide a concise summary of about {length} words in {language} on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history with 'History Background: '."
1352
  res=get_gpt_response(api_key,None,prompt)
 
1353
  state = state + [(None, res)]
1354
  read_info = re.sub(r'[#[\]!*]','',res)
1355
  read_info = emoji.replace_emoji(read_info,replace="")
@@ -1358,47 +1383,47 @@ async def get_yearinfo(year,api_key,state,language,autoplay,length):
1358
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1359
  # input_points=input_points, input_labels=input_labels)
1360
  if autoplay:
1361
- audio_output = await texttospeech(read_info, language,autoplay)
1362
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1363
- return state, state,audio_output
1364
- return state, state,None
1365
 
1366
 
1367
 
1368
 
1369
 
1370
- async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
1371
 
1372
- # state = state + [(None, f"Caption Everything: {paragraph}")]
1373
- Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1374
- AI_prompt = "Received."
1375
- visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
1376
 
1377
- # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
1378
- visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
1379
- # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
1380
- audio_output=await texttospeech(paragraph,language,autoplay)
1381
- return paragraph,audio_output
1382
 
1383
- def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
1384
 
1385
- model = build_caption_anything_with_models(
1386
- args,
1387
- api_key="",
1388
- captioner=shared_captioner,
1389
- sam_model=shared_sam_model,
1390
- ocr_reader=shared_ocr_reader,
1391
- text_refiner=text_refiner,
1392
- session_id=iface.app_id
1393
- )
1394
- paragraph = model.inference_cap_everything(image_input, verbose=True)
1395
- # state = state + [(None, f"Caption Everything: {paragraph}")]
1396
- Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1397
- AI_prompt = "Received."
1398
- visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
1399
- visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
1400
- # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
1401
- return paragraph
1402
 
1403
 
1404
 
@@ -1490,62 +1515,119 @@ def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragr
1490
 
1491
  # return like_state, dislike_state
1492
 
1493
- async def texttospeech(text, language, autoplay,gender='female'):
1494
  try:
1495
- if autoplay:
1496
- voice = filtered_language_dict[language][gender]
1497
- communicate = edge_tts.Communicate(text=text, voice=voice,rate="+25%")
1498
- file_path = "output.wav"
1499
- await communicate.save(file_path)
1500
- with open(file_path, "rb") as audio_file:
1501
- audio_bytes = BytesIO(audio_file.read())
1502
- audio = base64.b64encode(audio_bytes.read()).decode("utf-8")
1503
- print("TTS processing completed.")
1504
- audio_style = 'style="width:210px;"'
1505
- audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
1506
- else:
1507
- audio_player = None
1508
- print("Autoplay is disabled.")
1509
  return audio_player
 
1510
  except Exception as e:
1511
  print(f"Error in texttospeech: {e}")
1512
  return None
1513
 
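The `texttospeech` helper above is a thin wrapper over edge-tts. A minimal, standalone round trip with the same call pattern (the voice comes from `filtered_language_dict`; the output file name is only illustrative):

```python
# Minimal edge-tts round trip mirroring the call pattern in texttospeech().
import asyncio

import edge_tts

async def demo_tts():
    voice = "zh-HK-HiuGaaiNeural"  # e.g. filtered_language_dict['Cantonese']['female']
    communicate = edge_tts.Communicate(text="Hello from EyeSee.", voice=voice, rate="+25%")
    await communicate.save("demo_output.wav")  # illustrative output path

asyncio.run(demo_tts())
```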
1514
- async def associate(focus_info,openai_api_key,language,state,autoplay,length, evt: gr.SelectData):
 
1515
  rec_path=evt._data['value']['image']['path']
 
1516
  print("rec_path",rec_path)
1517
  prompt="""
1518
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects in the second painting that may be related to the selected object and list one fact of selected object, one fact of related object in the second painting and one analysis between two objects as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
1519
  """
1520
  prompt=prompt.format(Wiki_caption=focus_info,language=language,length=length)
1521
  result=get_gpt_response(openai_api_key, rec_path, prompt)
1522
- state = state + [(None, f"{result}")]
 
 
 
 
1523
  read_info = re.sub(r'[#[\]!*]','',result)
1524
  read_info = emoji.replace_emoji(read_info,replace="")
1525
  print("associate",read_info)
1526
  if autoplay:
1527
- audio_output = await texttospeech(read_info, language, autoplay)
1528
- return state,state,audio_output
1529
- return state,state,None
1530
-
1531
-
1532
 
 
 
1533
 
 
1534
 
1535
 
1536
- def print_like_dislike(x: gr.LikeData,like_res,dislike_res,state):
1537
  print(x.index, x.value, x.liked)
1538
  if x.liked == True:
1539
  print("liked")
1540
- like_res.append(x.value)
1541
- print(like_res)
1542
  state = state + [(None, f"Liked Received 👍")]
1543
  else:
1544
- dislike_res.append(x.value)
1545
  state = state + [(None, f"Disliked Received 👎")]
1546
- return like_res,dislike_res,state
1547
-
1548
 
1549
 
1550
  def toggle_icons_and_update_prompt(point_prompt):
1551
  new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
@@ -1568,13 +1650,13 @@ def create_ui():
      description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

      examples = [
-         ["test_images/ambass.jpg"],
-         ["test_images/test1.jpg"],
-         ["test_images/test2.jpg"],
-         ["test_images/test3.jpg"],
-         ["test_images/test4.jpg"],
-         ["test_images/test5.jpg"],
-         ["test_images/Picture5.png"],
      ]

@@ -1582,7 +1664,13 @@ def create_ui():
1582
  css=css,
1583
  theme=gr.themes.Base()
1584
  ) as iface:
 
1585
  state = gr.State([])
 
 
 
 
 
1586
  out_state = gr.State(None)
1587
  click_state = gr.State([[], [], []])
1588
  origin_image = gr.State(None)
@@ -1597,49 +1685,32 @@ def create_ui():
1597
  input_mask_state = gr.State(np.zeros((1, 1)))
1598
  input_points_state = gr.State([])
1599
  input_labels_state = gr.State([])
 
1600
  new_crop_save_path = gr.State(None)
1601
  image_input_nobackground = gr.State(None)
1602
  artist=gr.State(None)
1603
- like_res=gr.State([])
1604
- dislike_res=gr.State([])
1605
  gr.Markdown(title)
1606
  gr.Markdown(description)
1607
  point_prompt = gr.State("Positive")
1608
  log_list=gr.State([])
1609
  gender=gr.State('female')
 
1610
  image_path=gr.State('')
1611
- # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
1612
- # with gr.Column(scale=0.5):
1613
- # # gr.Markdown("Left side content")
1614
-
1615
- # with gr.Column(scale=0.5):
1616
- # with gr.Row(align="right",visible=False) as language_select:
1617
- # language = gr.Dropdown(
1618
- # ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1619
- # value="English", label="Language", interactive=True)
1620
-
1621
- # with gr.Row(align="right",visible=False) as autoplay:
1622
- # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1623
- # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1624
-
1625
-
1626
-
1627
-
1628
-
1629
-
1630
- # with gr.Row(align="right",visible=False) as language_select:
1631
- # language = gr.Dropdown(
1632
- # ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1633
- # value="English", label="Language", interactive=True)
1634
-
1635
- # with gr.Row(align="right",visible=False) as autoplay:
1636
- # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1637
- # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1638
-
1639
  with gr.Row():
1640
  with gr.Column(scale=6):
1641
  with gr.Column(visible=False) as modules_not_need_gpt:
1642
- with gr.Tab("Base(GPT Power)") as base_tab:
 
1643
  image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1644
  with gr.Row():
1645
  name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1647,7 +1718,7 @@ def create_ui():
1647
  year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1648
  material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")
1649
 
1650
- with gr.Tab("Base2") as base_tab2:
1651
  image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1652
  with gr.Row():
1653
  name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1665,6 +1736,12 @@ def create_ui():
1665
  artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
1666
  year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1667
  material_label = gr.Button(value="Style: ",elem_classes="info_btn")
 
 
 
 
 
 
1668
 
1669
 
1670
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
@@ -1673,10 +1750,9 @@ def create_ui():
1673
  add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
1674
  minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
1675
  clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
1676
- clear_button_image = gr.Button(value="Change", interactive=True,elem_classes="tools_button")
1677
- focus_d = gr.Button(value="D",interactive=True,elem_classes="function_button",variant="primary")
1678
- focus_da = gr.Button(value="DA",interactive=True,elem_classes="function_button",variant="primary")
1679
- focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button",variant="primary")
1680
  focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")
1681
 
1682
  recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
@@ -1743,28 +1819,13 @@ def create_ui():
1743
  value="No",
1744
  label="Expert",
1745
  interactive=True)
1746
-
1747
- with gr.Column(visible=False) as recommend:
1748
- gallery_result = gr.Gallery(
1749
- label="Result",
1750
- height="auto",
1751
- columns=4
1752
- # columns=4,
1753
- # rows=2,
1754
- # show_label=False,
1755
- # allow_preview=True,
1756
- # object_fit="contain",
1757
- # height="auto",
1758
- # preview=True,
1759
- # show_share_button=True,
1760
- # show_download_button=True
1761
- )
1762
-
1763
  with gr.Column(visible=True) as modules_not_need_gpt3:
1764
  gr.Examples(
1765
  examples=examples,
1766
  inputs=[example_image],
1767
  )
 
 
1768
 
1769
 
1770
 
@@ -1780,20 +1841,29 @@ def create_ui():
1780
  type="password")
1781
  with gr.Row():
1782
  enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
1783
- disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,
1784
- variant='primary')
1785
  with gr.Column(visible=False) as module_notification_box:
1786
  notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)
1787
 
1788
- with gr.Column() as modules_need_gpt0:
1789
- with gr.Column(visible=False) as modules_need_gpt2:
1790
- paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
1791
- cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
1792
 
1793
- with gr.Column(visible=False) as modules_not_need_gpt2:
 
1794
  with gr.Blocks():
1795
  chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
1796
- with gr.Column(visible=False) as modules_need_gpt3:
1797
  chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
1798
  with gr.Row():
1799
  clear_button_text = gr.Button(value="Clear Chat", interactive=True)
@@ -1801,13 +1871,9 @@ def create_ui():
1801
  # submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
1802
  # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
1803
  # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
1804
- with gr.Row():
1805
- naritive = gr.Radio(
1806
- choices=["Third-person", "Single-Persona: Artist","Multi-Persona: Objects"],
1807
- value="Third-person",
1808
- label="narritive",
1809
- scale=5,
1810
- interactive=True)
1811
 
1812
 
1813
  # TTS interface hidden initially
@@ -1824,10 +1890,46 @@ def create_ui():
1824
  with gr.Row():
1825
  submit_tts = gr.Button(value="Submit", interactive=True)
1826
  clear_tts = gr.Button(value="Clear", interactive=True)
 
 
1827
  ###############################################################################
1828
  ############# this part is for text to image #############
1829
  ###############################################################################
1830
-
1831
  with gr.Row(variant="panel",visible=False) as text2image_model:
1832
 
1833
  with gr.Column():
@@ -1922,10 +2024,14 @@ def create_ui():
1922
  # # show_download_button=True
1923
  # )
1924
 
1925
- with gr.Row():
1926
-
1927
  chat_log_file = gr.File(label="Download Chat Log",scale=5)
1928
- with gr.Row(visible=False, elem_id="top_row") as top_row:
 
 
 
 
 
1929
  language = gr.Dropdown(
1930
  ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1931
  value="English", label="Language", interactive=True, elem_classes="custom-language"
@@ -1938,12 +2044,12 @@ def create_ui():
1938
  interactive=True,
1939
  label="Generated Caption Length",
1940
  )
1941
- auto_play = gr.Checkbox(
1942
- label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
1943
- )
1944
- output_audio = gr.HTML(
1945
- label="Synthesised Audio", elem_classes="custom-output"
1946
- )
1947
 
1948
 
1949
 
@@ -1984,14 +2090,14 @@ def create_ui():
1984
  # )
1985
  recommend_btn.click(
1986
  fn=infer,
1987
- inputs=[new_crop_save_path],
1988
  outputs=[gallery_result]
1989
  )
1990
 
1991
  gallery_result.select(
1992
  associate,
1993
- inputs=[paragraph,openai_api_key,language,state,auto_play,length],
1994
- outputs=[chatbot,state,output_audio],
1995
 
1996
 
1997
  )
@@ -2093,7 +2199,7 @@ def create_ui():
2093
 
2094
  # mv_images = gr.State()
2095
 
2096
- # chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])
2097
 
2098
  # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
2099
  # fn=generate_mvs,
@@ -2138,81 +2244,80 @@ def create_ui():
2138
  # queue=False,
2139
  # show_progress=False
2140
  # )
 
 
 
 
 
 
2141
 
2142
 
2143
 
2144
 
2145
 
2146
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
2147
- outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
2148
- modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend])
2149
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
2150
- outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
2151
  modules_not_need_gpt,
2152
- modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend])
2153
- # openai_api_key.submit(init_openai_api_key,
2154
- # outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
2155
- # modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
2156
- # enable_chatGPT_button.click(init_openai_api_key,
2157
- # outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
2158
- # modules_not_need_gpt,
2159
- # modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
2160
-
2161
- disable_chatGPT_button.click(init_wo_openai_api_key,
2162
- outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
2163
- modules_not_need_gpt,
2164
- modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])
2165
-
2166
- artist_label_base2.click(
2167
- get_artistinfo,
2168
- inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length],
2169
- outputs=[chatbot,state,output_audio]
2170
- )
2171
  artist_label.click(
2172
  get_artistinfo,
2173
- inputs=[artist_label,openai_api_key,state,language,auto_play,length],
2174
- outputs=[chatbot,state,output_audio]
2175
- )
2176
- artist_label_traj.click(
2177
- get_artistinfo,
2178
- inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length],
2179
- outputs=[chatbot,state,output_audio]
2180
  )
 
 
 
 
 
2181
 
2182
- year_label_base2.click(
2183
- get_yearinfo,
2184
- inputs=[year_label_base2,openai_api_key,state,language,auto_play,length],
2185
- outputs=[chatbot,state,output_audio]
2186
- )
2187
  year_label.click(
2188
  get_yearinfo,
2189
- inputs=[year_label,openai_api_key,state,language,auto_play,length],
2190
- outputs=[chatbot,state,output_audio]
2191
- )
2192
- year_label_traj.click(
2193
- get_yearinfo,
2194
- inputs=[year_label_traj,openai_api_key,state,language,auto_play,length],
2195
- outputs=[chatbot,state,output_audio]
2196
  )
 
 
 
 
 
2197
 
2198
 
2199
- enable_chatGPT_button.click(
2200
- lambda: (None, [], [], [[], [], []], "", "", ""),
2201
- [],
2202
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2203
- queue=False,
2204
- show_progress=False
2205
- )
2206
- openai_api_key.submit(
2207
- lambda: (None, [], [], [[], [], []], "", "", ""),
2208
- [],
2209
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2210
- queue=False,
2211
- show_progress=False
2212
- )
2213
 
2214
- cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
2215
- [paragraph_output,output_audio])
2216
 
2217
  clear_button_click.click(
2218
  lambda x: ([[], [], []], x),
@@ -2222,53 +2327,61 @@ def create_ui():
2222
  show_progress=False
2223
  )
2224
  clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
2225
- clear_button_image.click(
2226
- lambda: (None, [], [], [[], [], []], "", "", ""),
2227
- [],
2228
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2229
- queue=False,
2230
- show_progress=False
2231
- )
2232
- clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
2233
  clear_button_text.click(
2234
- lambda: ([], [], [[], [], [], []]),
2235
  [],
2236
- [chatbot, state, click_state],
2237
  queue=False,
2238
  show_progress=False
2239
  )
2240
  clear_button_text.click(clear_chat_memory, inputs=[visual_chatgpt])
2241
 
2242
  image_input.clear(
2243
- lambda: (None, [], [], [[], [], []], "", "", ""),
2244
  [],
2245
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2246
  queue=False,
2247
  show_progress=False
2248
  )
2249
 
2250
  image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])
 
2251
 
2252
 
2253
 
2254
 
2255
- image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
2256
- [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2257
- image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2258
- name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2259
- paragraph,artist,gender,image_path])
2260
 
2261
- image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
2262
- [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2263
- image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2264
- name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2265
- paragraph,artist,gender,image_path])
2266
 
2267
- image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key,language,naritive],
2268
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2269
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2270
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2271
- paragraph,artist,gender,image_path])
2272
 
2273
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
2274
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2282,45 +2395,45 @@ def create_ui():
2282
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
2283
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
2284
  # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
2285
- chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path],
2286
- [chatbot, state, aux_state,output_audio])
2287
  # chat_input.submit(lambda: "", None, chat_input)
2288
  chat_input.submit(lambda: {"text": ""}, None, chat_input)
2289
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2290
  # [chatbot, state, aux_state,output_audio])
2291
  # submit_button_text.click(lambda: "", None, chat_input)
2292
- example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key,language,naritive],
2293
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2294
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2295
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2296
- paragraph,artist,gender,image_path])
2297
 
2298
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
2299
 
2300
- def on_click_tab_selected():
2301
- if gpt_state ==1:
2302
- print(gpt_state)
2303
- print("using gpt")
2304
- return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
2305
- else:
2306
- print("no gpt")
2307
- print("gpt_state",gpt_state)
2308
- return [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2
2309
-
2310
- def on_base_selected():
2311
- if gpt_state ==1:
2312
- print(gpt_state)
2313
- print("using gpt")
2314
- return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
2315
- else:
2316
- print("no gpt")
2317
- return [gr.update(visible=False)]*4
2318
-
2319
-
2320
- traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2321
- click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2322
- base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
2323
- base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
2324
 
2325
 
2326
 
@@ -2330,7 +2443,7 @@ def create_ui():
2330
  inputs=[
2331
  origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
2332
  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
2333
- out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2334
  ],
2335
  outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
2336
  show_progress=False, queue=True
@@ -2341,10 +2454,10 @@ def create_ui():
2341
  submit_caption,
2342
  inputs=[
2343
  naritive, state,length, sentiment, factuality, language,
2344
- out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph,focus_d,openai_api_key,new_crop_save_path,gender
2345
  ],
2346
  outputs=[
2347
- chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
2348
  ],
2349
  show_progress=True,
2350
  queue=True
@@ -2358,11 +2471,12 @@ def create_ui():
2358
  submit_caption,
2359
  inputs=[
2360
  naritive,state,length, sentiment, factuality, language,
2361
- out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,auto_play, paragraph,focus_da,openai_api_key,new_crop_save_path
 
2362
  ],
2363
  outputs=[
2364
- chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
2365
- ],
2366
  show_progress=True,
2367
  queue=True
2368
  )
@@ -2373,10 +2487,10 @@ def create_ui():
2373
  inputs=[
2374
  naritive,state,length, sentiment, factuality, language,
2375
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2376
- auto_play, paragraph,focus_dai,openai_api_key,new_crop_save_path
2377
  ],
2378
  outputs=[
2379
- chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
2380
  ],
2381
  show_progress=True,
2382
  queue=True
@@ -2388,10 +2502,10 @@ def create_ui():
2388
  inputs=[
2389
  naritive,state,length, sentiment, factuality, language,
2390
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2391
- auto_play, paragraph,focus_dda,openai_api_key,new_crop_save_path
2392
  ],
2393
  outputs=[
2394
- chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
2395
  ],
2396
  show_progress=True,
2397
  queue=True
@@ -2431,20 +2545,26 @@ def create_ui():
2431
 
2432
  export_button.click(
2433
  export_chat_log,
2434
- inputs=[state,paragraph,like_res,dislike_res,log_list],
2435
  outputs=[chat_log_file,log_list],
2436
  queue=True
2437
  )
2438
 
2439
  naritive.change(
2440
- lambda: (None, [], [], [[], [], []], "", "", ""),
2441
- [],
2442
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2443
  queue=False,
2444
  show_progress=False
2445
 
2446
  )
2447
 
 
 
 
 
 
 
2448
  # upvote_btn.click(
2449
  # handle_liked,
2450
  # inputs=[state,like_res],
 
The corresponding new-file content for the hunks above (added lines marked with +):

+ import datetime
  from io import BytesIO
  import io
  from math import inf
 
      return image_features

  @spaces.GPU
+ def infer(crop_image_path,full_image_path):
+     input_image = Image.open(crop_image_path).convert("RGB")
+     input_features = extract_features_siglip(input_image.convert("RGB"))
+     input_features = input_features.detach().cpu().numpy()
+     input_features = np.float32(input_features)
+     faiss.normalize_L2(input_features)
+     distances, indices = index.search(input_features, 2)
+     gallery_output = []
+     for i,v in enumerate(indices[0]):
+         sim = -distances[0][i]
+         image_url = df.iloc[v]["Link"]
+         img_retrieved = read_image_from_url(image_url)
+         gallery_output.append(img_retrieved)
+
+     input_image = Image.open(full_image_path).convert("RGB")
+     input_features = extract_features_siglip(input_image.convert("RGB"))
+     input_features = input_features.detach().cpu().numpy()
+     input_features = np.float32(input_features)
+     faiss.normalize_L2(input_features)
+     distances, indices = index.search(input_features, 2)
+     for i,v in enumerate(indices[0]):
+         sim = -distances[0][i]
+         image_url = df.iloc[v]["Link"]
+         img_retrieved = read_image_from_url(image_url)
+         gallery_output.append(img_retrieved)
+
+     return gallery_output

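With the new signature, `infer` retrieves two nearest neighbours for the selected crop and two for the whole painting, so the Recommend button now fills the gallery with up to four images. An illustrative call (paths are placeholders):

```python
# Illustrative only: placeholder paths, and assumes the FAISS index is non-empty.
gallery = infer("chat_image/example_crop.png", "chat_image/example_full.png")
print(len(gallery))  # 4 retrieved images: 2 for the crop + 2 for the full painting
```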
  ###############################################################################
 
      'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'}
  }

+ focus_map = {
+     "Describe":0,
+     "D+Analysis":1,
+     "DA+Interprete":2,
      "Judge":3
  }
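The focus buttons were renamed from D/DA/DAI to Describe/D+Analysis/DA+Interprete, and `focus_map` still turns the button label into an index. A hedged sketch of how that index is presumably consumed by `generate_prompt` via `prompt_list` (the real lookup lives elsewhere in app.py):

```python
# Hedged sketch: the focus index selects a template from prompt_list for the
# active narrative; generate_prompt() in app.py presumably does the equivalent.
focus_type = "D+Analysis"
focus_value = focus_map[focus_type]        # -> 1
narrative_templates = prompt_list[0]       # assumed: index 0 = "Third-person" narrative
prompt = narrative_templates[focus_value].format(
    Wiki_caption="A portrait of two ambassadors with symbolic objects.",
    language="English",
    length=60,
)
```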
572
  prompt_list = [
573
  [
574
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
 
722
  global gpt_state
723
  gpt_state=1
724
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
725
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]* 3 + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3+[gr.update(visible=False)]
726
  else:
727
  gpt_state=0
728
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
729
+ return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*4
730
 
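Both branches of the updated `init_openai_api_key` now return 15 values; in Gradio the length of each return must match the `outputs=[...]` list wired to the handler in create_ui(), so a quick count is worth keeping in mind when these lists change:

```python
# Illustrative count of the two return branches above; both must stay equal to
# the number of components listed in outputs=[...] for openai_api_key.submit
# and enable_chatGPT_button.click.
success_branch = 1 + 1 + 3 + 3 + 3 + 3 + 1   # updates plus [text_refiner, visual_chatgpt, None]
failure_branch = 6 + 2 + 3 + 4
assert success_branch == failure_branch == 15
```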
731
  def init_wo_openai_api_key():
732
  global gpt_state
 
734
  # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
735
  return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]
736
 
737
+
738
  def get_click_prompt(chat_input, click_state, click_mode):
739
  inputs = json.loads(chat_input)
740
  if click_mode == 'Continuous':
 
772
  raise NotImplementedError
773
 
774
  async def chat_input_callback(*args):
775
+ visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input,log_state,history = args
776
  message = chat_input["text"]
777
  prompt="Please help me answer the question with this painting {question} in {language}."
778
  prompt=prompt.format(question=message, language=language)
779
+
780
  if visual_chatgpt is not None:
781
+ result=get_gpt_response(api_key, image_input,prompt+message,history)
 
782
  read_info = re.sub(r'[#[\]!*]','',result)
783
  read_info = emoji.replace_emoji(read_info,replace="")
784
+ state = state + [(message,result)]
785
+ log_state += [(message,result)]
786
+ # log_state += [("%% chat messahe %%",None)]
787
+
788
+ history.append({"role": "user", "content": message})
789
+ history.append({"role": "assistant", "content": result})
790
+
791
  if autoplay==False:
792
+ return state, state, aux_state, None,log_state,history
793
 
794
  else:
795
+ audio = await texttospeech(read_info,language,gender)
796
+ return state, state, aux_state, audio,log_state,history
797
  else:
798
  response = "Text refiner is not initilzed, please input openai api key."
799
  state = state + [(chat_input, response)]
800
+ audio = await texttospeech(response,language,gender)
801
+ return state, state, None, audio,log_state,history
 
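`chat_input_callback` now threads a `history` list alongside the Gradio chat `state`. Judging from the appends above, it is an OpenAI-style message list handed back to `get_gpt_response` so follow-up questions keep their context; roughly:

```python
# Assumed shape of the new `history` state: a plain list of role/content dicts
# that grows by one user and one assistant entry per turn.
history = []
history.append({"role": "user", "content": "Who is the figure on the left?"})
history.append({"role": "assistant", "content": "He appears to be a nobleman in Renaissance dress."})
```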
802
 
803
 
804
+ def upload_callback(image_input, state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None):
805
  print("narritive", narritive)
806
+
807
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
808
  image_input = image_input['background']
809
 
 
814
 
815
 
816
  click_state = [[], [], []]
817
+
818
+
819
+ # width, height = image_input.size
820
+
821
+ # target_width=500
822
+ # target_height=650
823
+
824
+ # width_ratio = target_width / width
825
+ # height_ratio = target_height / height
826
+ # ratio = min(width_ratio, height_ratio)
827
+
828
+ # if ratio < 1.0:
829
+ # new_size = (int(width * ratio), int(height * ratio))
830
+ # image_input = image_input.resize(new_size, Image.ANTIALIAS)
831
+
832
  image_input = image_resize(image_input, res=1024)
833
 
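The commented-out block above was an inline aspect-ratio resize; the code now delegates to `image_resize(image_input, res=1024)`, which is defined elsewhere in app.py. A hedged sketch of what such a helper typically does:

```python
# Hedged sketch of an image_resize-style helper (the real one is elsewhere in
# app.py): cap the longer side at `res` pixels, preserve aspect ratio, never upscale.
from PIL import Image

def image_resize_sketch(image: Image.Image, res: int = 1024) -> Image.Image:
    width, height = image.size
    ratio = min(res / width, res / height, 1.0)
    if ratio < 1.0:
        image = image.resize((int(width * ratio), int(height * ratio)), Image.LANCZOS)
    return image
```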
834
  model = build_caption_anything_with_models(
 
850
  image_input.save(new_image_path)
851
  visual_chatgpt.current_image = new_image_path
852
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
 
853
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
854
  parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
855
+ print(parsed_data)
856
  parsed_data = json.loads(parsed_data.replace("'", "\""))
857
  name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
858
  gender=gender.lower()
859
  print("gender",gender)
860
+
861
 
862
 
863
  if language=="English":
 
903
  None,
904
  f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
905
  )
906
+ ]
907
+
908
+ log_state += [(name,None)]
909
+ log_state=log_state+[(paragraph,None)]
910
+ log_state=log_state+[(narritive,None)]
911
+ log_state=log_state+state
912
+ log_state = log_state + [("%% basic information %%", None)]
913
+
914
+ history=[]
915
+ history.append({"role": "assistant", "content": paragraph+state[0][1]})
916
 
917
+
 
918
 
919
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
920
+ original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history]
921
 
922
 
923
 
 
956
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
957
  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
958
  # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
 
 
 
 
 
959
 
960
+
961
 
962
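+ # A SAM point label of 1 marks a positive click (area added to the mask);
+ # any other label marks a negative click (area removed). Report it in the UI language.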
+ print(prompt["input_label"][-1])
963
+ if language=="English":
964
+ if prompt["input_label"][-1]==1:
965
+ msg="You've added an area at {}. ".format(prompt["input_point"][-1])
966
+ else:
967
+ msg="You've removed an area at {}. ".format(prompt["input_point"][-1])
968
+ else:
969
+ if prompt["input_label"][-1]==1:
970
+ msg="你添加了在 {} 的区域。 ".format(prompt["input_point"][-1])
971
+ else:
972
+ msg="你删除了在 {} 的区域。 ".format(prompt["input_point"][-1])
973
 
974
+ state = state + [(msg, None)]
975
+
976
  input_mask = np.array(out['mask'].convert('P'))
977
  image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)
978
 
 
983
  out_state = out
984
 
985
  if visual_chatgpt is not None:
 
986
  new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
987
  Image.open(out["crop_save_path"]).save(new_crop_save_path)
988
+ print("new crop save",new_crop_save_path)
 
 
 
 
989
 
990
  yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
991
 
992
 
993
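+ # Canned user-facing queries shown in the chat for each focus level; the list index
+ # matches the focus_map value (describe / +analysis / +interpretation / judge).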
+ query_focus_en = [
994
+ "Provide a description of the item.",
995
+ "Provide a description and analysis of the item.",
996
+ "Provide a description, analysis, and interpretation of the item.",
997
+ "Evaluate the item."
998
+ ]
999
+
1000
+ query_focus_zh = [
1001
+ "请描述一下这个物体。",
1002
+ "请描述和分析一下这个物体。",
1003
+ "请描述、分析和解释一下这个物体。",
1004
+ "请以艺术鉴赏的角度评价一下这个物体。"
1005
+ ]
1006
 
1007
 
1008
  async def submit_caption(naritive, state,length, sentiment, factuality, language,
1009
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1010
+ autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path, gender,log_state,history):
1011
 
1012
 
1013
+ focus_value=focus_map[focus_type]
 
1014
  click_index = click_index_state
1015
 
 
 
 
 
 
1016
  print("click_index",click_index)
1017
  print("input_points_state",input_points_state)
1018
  print("input_labels_state",input_labels_state)
1019
 
1020
  prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
1021
 
1022
+ log_state = log_state + [("Selected image point: {}, Input label: {}".format(input_points_state, input_labels_state), None)]
1023
+
1024
+
1025
  print("Prompt:", prompt)
1026
  print("click",click_index)
1027
 
1028
+ log_state = log_state + [(naritive, None)]
1029
+
 
1030
 
1031
  # if not args.disable_gpt and text_refiner:
1032
  if not args.disable_gpt:
1033
  print("new crop save",new_crop_save_path)
1034
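+ # Send the cropped object image plus the focus-specific prompt (and recent history) to GPT;
+ # surrounding quotes and '#' characters are stripped from the reply before display.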
+ focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt,history)
1035
  if focus_info.startswith('"') and focus_info.endswith('"'):
1036
  focus_info=focus_info[1:-1]
1037
  focus_info=focus_info.replace('#', '')
1038
  # state = state + [(None, f"Wiki: {paragraph}")]
1039
+ if language=="English":
1040
+ user_query=query_focus_en[focus_value]
1041
+
1042
+ else:
1043
+ user_query=query_focus_zh[focus_value]
1044
+
1045
+ state = state + [(user_query, f"{focus_info}")]
1046
+ log_state = log_state + [(user_query, None)]
1047
+ log_state = log_state + [(None, f"{focus_info}")]
1048
+
1049
+ # save history
1050
+ history.append({"role": "user", "content": user_query})
1051
+ history.append({"role": "assistant", "content": focus_info})
1052
+
1053
+
1054
+
1055
  print("new_cap",focus_info)
1056
  read_info = re.sub(r'[#[\]!*]','',focus_info)
1057
  read_info = emoji.replace_emoji(read_info,replace="")
 
1067
  print("error gpt responese")
1068
  print("item gender",gender)
1069
 
 
 
1070
  try:
1071
  if autoplay==False:
1072
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,log_state,history
1073
 
1074
+ audio_output = await texttospeech(read_info, language,gender)
1075
  print("done")
1076
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1077
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output,log_state,history
1078
 
1079
  except Exception as e:
1080
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1081
  print(f"Error during TTS prediction: {str(e)}")
1082
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
1083
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,log_state,history
1084
 
1085
  else:
1086
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1087
  print(f"Error during TTS prediction: {str(e)}")
1088
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,log_state,history
1089
 
1090
 
1091
 
 
1125
  with open(image_path, "rb") as image_file:
1126
  return base64.b64encode(image_file.read()).decode('utf-8')
1127
 
1128
+ def get_gpt_response(api_key, image_path, prompt, history=None):
 
 
 
 
1129
 
1130
  headers = {
1131
  "Content-Type": "application/json",
1132
  "Authorization": f"Bearer {api_key}"
1133
  }
 
1134
 
1135
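+ # Keep only the last four history messages to bound the request size, then build a
+ # chat-completions payload; when an image path is given it is attached as a base64 data URL.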
+ if history:
1136
+ if len(history) > 4:
1137
+ history = history[-4:]
1138
+ else:
1139
+ history = []
1140
+
1141
+ messages = history[:]
1142
+
1143
  if image_path:
1144
+ base64_image = encode_image(image_path)
1145
+ messages.append({
1146
+ "role": "user",
1147
+ "content": [
 
 
 
 
 
 
 
 
1148
  {
1149
+ "type": "text",
1150
+ "text": prompt
1151
+ },
1152
+ {
1153
+ "type": "image_url",
1154
+ "image_url": {
1155
+ "url": f"data:image/jpeg;base64,{base64_image}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1156
  }
1157
+ }
1158
+ ]
1159
+ })
1160
+ else:
1161
+ messages.append({"role": "user",
1162
+ "content":
1163
+ {
1164
+ "type": "text",
1165
+ "text": prompt
1166
+ }})
1167
+
1168
+ payload = {
1169
+ "model": "gpt-4o",
1170
+ "messages": messages,
1171
+ "max_tokens": 600
1172
+ }
1173
+
1174
 
1175
  # Sending the request to the OpenAI API
1176
  response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
1177
  result = response.json()
1178
+ print("gpt result",result)
1179
  try:
1180
  content = result['choices'][0]['message']['content']
1181
  return content
 
1319
  visual_chatgpt.global_prompt = ""
1320
 
1321
 
1322
+ def export_chat_log(chat_state,log_list,narrative):
1323
  try:
1324
+ chat_log=""
1325
  if not chat_state:
1326
  return None
 
1327
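+ # Flatten each (user, bot) pair to plain text; '%%'-prefixed entries are kept as section
+ # markers and one-sided turns are separated with '/////'. The log is saved as
+ # <timestamp>_<narrative>.txt in the current working directory.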
  for entry in chat_state:
1328
  user_message, bot_response = entry
1329
  if user_message and bot_response:
1330
  chat_log += f"User: {user_message}\nBot: {bot_response}\n"
1331
+ elif user_message and user_message.startswith("%%"):
1332
+ chat_log += f"{user_message}\n"
1333
  elif user_message:
1334
  chat_log += f"User: {user_message}\n"
1335
+ chat_log += f"///// \n"
1336
  elif bot_response:
1337
  chat_log += f"Bot: {bot_response}\n"
1338
+ chat_log += f"///// \n"
1339
 
1340
+ print("export log...")
1341
+ current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
1342
+ file_name = f"{current_time}_{narrative}.txt"
1343
+ file_path = os.path.join(os.getcwd(), file_name) # Save to the current working directory
1344
 
1345
+ with open(file_path, 'w', encoding='utf-8') as file:
1346
+ file.write(chat_log)
 
1347
 
1348
+ print(file_path)
1349
+
1350
+ log_list.append(file_path)
 
 
 
 
1351
  return log_list,log_list
1352
  except Exception as e:
1353
  print(f"An error occurred while exporting the chat log: {e}")
1354
+ return None,None
1355
 
1356
+ async def get_artistinfo(artist_name,api_key,state,language,autoplay,length,log_state):
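+ # Ask GPT (text-only, no image attached) for a short background summary of the artist
+ # and optionally read it aloud with TTS.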
1357
  prompt = f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received. Start your response with 'Artist Background: '."
1358
  res=get_gpt_response(api_key,None,prompt)
1359
  state = state + [(None, res)]
1360
  read_info = re.sub(r'[#[\]!*]','',res)
1361
  read_info = emoji.replace_emoji(read_info,replace="")
1362
+ log_state=log_state+[(f"res", None)]
1363
 
1364
 
1365
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1366
  # input_points=input_points, input_labels=input_labels)
1367
  if autoplay:
1368
+ audio_output = await texttospeech(read_info, language)
1369
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1370
+ return state, state,audio_output,log_state
1371
+ return state, state,None,log_state
1372
 
1373
 
1374
+ async def get_yearinfo(year,api_key,state,language,autoplay,length,log_state):
1375
  prompt = f"Provide a concise summary of about {length} words in {language} on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history with 'History Background: '."
1376
  res=get_gpt_response(api_key,None,prompt)
1377
+ log_state=log_state+[(f"res", None)]
1378
  state = state + [(None, res)]
1379
  read_info = re.sub(r'[#[\]!*]','',res)
1380
  read_info = emoji.replace_emoji(read_info,replace="")
 
1383
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1384
  # input_points=input_points, input_labels=input_labels)
1385
  if autoplay:
1386
+ audio_output = await texttospeech(read_info, language)
1387
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1388
+ return state, state,audio_output,log_state
1389
+ return state, state,None,log_state
1390
 
1391
 
1392
 
1393
 
1394
 
1395
+ # async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
1396
 
1397
+ # # state = state + [(None, f"Caption Everything: {paragraph}")]
1398
+ # Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1399
+ # AI_prompt = "Received."
1400
+ # visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
1401
 
1402
+ # # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
1403
+ # visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
1404
+ # # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
1405
+ # audio_output=await texttospeech(paragraph,language,autoplay)
1406
+ # return paragraph,audio_output
1407
 
1408
+ # def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
1409
 
1410
+ # model = build_caption_anything_with_models(
1411
+ # args,
1412
+ # api_key="",
1413
+ # captioner=shared_captioner,
1414
+ # sam_model=shared_sam_model,
1415
+ # ocr_reader=shared_ocr_reader,
1416
+ # text_refiner=text_refiner,
1417
+ # session_id=iface.app_id
1418
+ # )
1419
+ # paragraph = model.inference_cap_everything(image_input, verbose=True)
1420
+ # # state = state + [(None, f"Caption Everything: {paragraph}")]
1421
+ # Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1422
+ # AI_prompt = "Received."
1423
+ # visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
1424
+ # visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
1425
+ # # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
1426
+ # return paragraph
1427
 
1428
 
1429
 
 
1515
 
1516
  # return like_state, dislike_state
1517
 
1518
+ async def texttospeech(text, language,gender='female'):
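+ # Synthesize speech with edge-tts using the voice mapped from (language, gender),
+ # save it to output.wav, and return an autoplaying HTML <audio> element whose source
+ # is the audio embedded as a base64 data URI.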
1519
  try:
1520
+
1521
+ voice = filtered_language_dict[language][gender]
1522
+ communicate = edge_tts.Communicate(text=text, voice=voice,rate="+25%")
1523
+ file_path = "output.wav"
1524
+ await communicate.save(file_path)
1525
+ with open(file_path, "rb") as audio_file:
1526
+ audio_bytes = BytesIO(audio_file.read())
1527
+ audio = base64.b64encode(audio_bytes.read()).decode("utf-8")
1528
+ print("TTS processing completed.")
1529
+ audio_style = 'style="width:210px;"'
1530
+ audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
 
 
 
1531
  return audio_player
1532
+
1533
  except Exception as e:
1534
  print(f"Error in texttospeech: {e}")
1535
  return None
1536
 
1537
+ # give the reason for the recommendation
1538
+ async def associate(focus_info,openai_api_key,language,autoplay,length,log_state,sort_score,evt: gr.SelectData,narritive=None):
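+ # Called when a recommended painting is selected in the gallery: GPT is asked to relate
+ # the currently selected object (focus_info) to the chosen image, and evt.index is
+ # returned so the later recommendation score can be tied to this picture.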
1539
  rec_path=evt._data['value']['image']['path']
1540
+ index=evt.index
1541
  print("rec_path",rec_path)
1542
  prompt="""
1543
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects in the second painting that may be related to the selected object and list one fact of selected object, one fact of related object in the second painting and one analysis between two objects as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
1544
  """
1545
  prompt=prompt.format(Wiki_caption=focus_info,language=language,length=length)
1546
  result=get_gpt_response(openai_api_key, rec_path, prompt)
1547
+ print("recommend result",result)
1548
+ reason = [(None, f"{result}")]
1549
+ log_state = log_state + [(narritive, None)]
1550
+ log_state = log_state + [(f"image sort ranking {sort_score}", None)]
1551
+ log_state = log_state + [(None, f"{result}")]
1552
  read_info = re.sub(r'[#[\]!*]','',result)
1553
  read_info = emoji.replace_emoji(read_info,replace="")
1554
  print("associate",read_info)
1555
  if autoplay:
1556
+ audio_output = await texttospeech(read_info, language)
1557
+ return reason,audio_output,log_state,index
1558
+ return reason,None,log_state,index
 
 
1559
 
1560
+ def change_naritive(task_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
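+ # "Session 1" resets the whole workspace; otherwise append a persona-specific greeting
+ # (third-person, artist, or depicted objects) in the chosen language.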
1561
+ if task_type=="Session 1":
1562
+ return None, [], [], [[], [], []], "", None, []
1563
+ else:
1564
+ if language=="English":
1565
+ if narritive=="Third-person" :
1566
+ state += [
1567
+ (
1568
+ None,
1569
+ f"🤖 Hi, I am EyeSee. Let's explore this painting together."
1570
+ )
1571
+ ]
1572
+ elif narritive=="Single-Persona: Artist":
1573
+ state += [
1574
+ (
1575
+ None,
1576
+ f"🧑‍🎨 Let's delve into it from the perspective of the artist."
1577
+ )
1578
+ ]
1579
+ elif narritive=="Multi-Persona: Objects":
1580
+ state += [
1581
+ (
1582
+ None,
1583
+ f"🎨 Let's delve into it from the perspective of the objects depicted in the scene."
1584
+ )
1585
+ ]
1586
+ elif language=="Chinese":
1587
+ if narritive=="Third-person" :
1588
+ state += [
1589
+ (
1590
+ None,
1591
+ "🤖 让我们从第三方视角一起探索这幅画吧。"
1592
+ )
1593
+ ]
1594
+ elif narritive == "Single-Persona: Artist":
1595
+ state += [
1596
+ (
1597
+ None,
1598
+ "🧑‍🎨 让我们从艺术家的视角深入探索这幅画。"
1599
+ )
1600
+ ]
1601
+ elif narritive == "Multi-Persona: Objects":
1602
+ state += [
1603
+ (
1604
+ None,
1605
+ "🎨 让我们从画面中事物的视角深入探索这幅画。"
1606
+ )
1607
+ ]
1608
 
1609
+ return image_input, state, state, click_state, paragraph, origin_image, gr.update()  # leave gallery_result unchanged
1610
 
1611
 
1612
+ def print_like_dislike(x: gr.LikeData,state,log_state):
1613
  print(x.index, x.value, x.liked)
1614
  if x.liked == True:
1615
  print("liked")
1616
+ log_state=log_state+[(f"User liked this message", None)]
 
1617
  state = state + [(None, f"Liked Received 👍")]
1618
  else:
1619
+ log_state=log_state+[(f"User disliked this message", None)]
1620
  state = state + [(None, f"Disliked Received 👎")]
1621
+ log_state+=[("%% user interaction %%", None)]
1622
+ return log_state,state
1623
 
1624
+ def get_recommendationscore(index,score,log_state):
1625
+ log_state+=[(f"Picture {index} : {score}",None)]
1626
+ log_state+=[("%% recommendation %%",None)]
1627
+ return log_state
1628
+
1629
+
1630
+
1631
 
1632
  def toggle_icons_and_update_prompt(point_prompt):
1633
  new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
 
1650
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
1651
 
1652
  examples = [
1653
+ ["test_images/1.The Ambassadors.jpg"],
1654
+ ["test_images/2.Football Players.jpg"],
1655
+ ["test_images/3.Along the River during the Qingming Festival.jpeg"],
1656
+ # ["test_images/test3.jpg"],
1657
+ # ["test_images/test4.jpg"],
1658
+ # ["test_images/test5.jpg"],
1659
+ # ["test_images/Picture5.png"],
1660
 
1661
  ]
1662
 
 
1664
  css=css,
1665
  theme=gr.themes.Base()
1666
  ) as iface:
1667
+ # chat messages displayed in the chatbox
1668
  state = gr.State([])
1669
+ # messages exported to the chat log
1670
+ log_state=gr.State([])
1671
+ # history log for gpt
1672
+ history_log=gr.State([])
1673
+
1674
  out_state = gr.State(None)
1675
  click_state = gr.State([[], [], []])
1676
  origin_image = gr.State(None)
 
1685
  input_mask_state = gr.State(np.zeros((1, 1)))
1686
  input_points_state = gr.State([])
1687
  input_labels_state = gr.State([])
1688
+ # store the path of the cropped selected object
1689
  new_crop_save_path = gr.State(None)
1690
  image_input_nobackground = gr.State(None)
1691
  artist=gr.State(None)
 
 
1692
  gr.Markdown(title)
1693
  gr.Markdown(description)
1694
  point_prompt = gr.State("Positive")
1695
  log_list=gr.State([])
1696
  gender=gr.State('female')
1697
+ # store the whole image path
1698
  image_path=gr.State('')
1699
+ pic_index=gr.State(None)
1700
+
1701
+
1702
+ with gr.Row():
1703
+ auto_play = gr.Checkbox(
1704
+ label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
1705
+ )
1706
+ output_audio = gr.HTML(
1707
+ label="Synthesised Audio", elem_classes="custom-output"
1708
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1709
  with gr.Row():
1710
  with gr.Column(scale=6):
1711
  with gr.Column(visible=False) as modules_not_need_gpt:
1712
+
1713
+ with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
1714
  image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1715
  with gr.Row():
1716
  name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
 
1718
  year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1719
  material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")
1720
 
1721
+ with gr.Tab("Base2",visible=False) as base_tab2:
1722
  image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1723
  with gr.Row():
1724
  name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
 
1736
  artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
1737
  year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1738
  material_label = gr.Button(value="Style: ",elem_classes="info_btn")
1739
+
1740
+ with gr.Row():
1741
+ gr.Examples(
1742
+ examples=examples,
1743
+ inputs=[example_image],
1744
+ )
1745
 
1746
 
1747
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
 
1750
  add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
1751
  minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
1752
  clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
1753
+ focus_d = gr.Button(value="Describe",interactive=True,elem_classes="function_button",variant="primary")
1754
+ focus_da = gr.Button(value="D+Analysis",interactive=True,elem_classes="function_button",variant="primary")
1755
+ focus_dai = gr.Button(value="DA+Interprete",interactive=True,elem_classes="function_button",variant="primary")
 
1756
  focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")
1757
 
1758
  recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
 
1819
  value="No",
1820
  label="Expert",
1821
  interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1822
  with gr.Column(visible=True) as modules_not_need_gpt3:
1823
  gr.Examples(
1824
  examples=examples,
1825
  inputs=[example_image],
1826
  )
1827
+
1828
+
1829
 
1830
 
1831
 
 
1841
  type="password")
1842
  with gr.Row():
1843
  enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
1844
+ # disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,
1845
+ # variant='primary')
1846
  with gr.Column(visible=False) as module_notification_box:
1847
  notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)
1848
 
1849
+ # with gr.Column() as modules_need_gpt0:
1850
+ # with gr.Column(visible=False) as modules_need_gpt2:
1851
+ # paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
1852
+ # cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
1853
 
1854
+
1855
+
1856
+ with gr.Column(visible=False) as modules_not_need_gpt2:
1857
+ with gr.Row():
1858
+ naritive = gr.Radio(
1859
+ choices=["Third-person", "Single-Persona: Artist","Multi-Persona: Objects"],
1860
+ value="Third-person",
1861
+ label="Persona",
1862
+ scale=5,
1863
+ interactive=True)
1864
  with gr.Blocks():
1865
  chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
1866
+ with gr.Column() as modules_need_gpt3:
1867
  chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
1868
  with gr.Row():
1869
  clear_button_text = gr.Button(value="Clear Chat", interactive=True)
 
1871
  # submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
1872
  # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
1873
  # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
1874
+
1875
+
1876
+
 
 
 
 
1877
 
1878
 
1879
  # TTS interface hidden initially
 
1890
  with gr.Row():
1891
  submit_tts = gr.Button(value="Submit", interactive=True)
1892
  clear_tts = gr.Button(value="Clear", interactive=True)
1893
+
1894
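+ # Recommendation panel: a gallery of recommended paintings, a dropdown to rank them,
+ # a chatbot pane for the generated comparison reasons, and a 0-5 rating radio.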
+ with gr.Row():
1895
+ with gr.Column(scale=6):
1896
+ with gr.Column(visible=False) as recommend:
1897
+ gallery_result = gr.Gallery(
1898
+ label="Recommendations",
1899
+ height="auto",
1900
+ columns=4
1901
+ # columns=4,
1902
+ # rows=2,
1903
+ # show_label=False,
1904
+ # allow_preview=True,
1905
+ # object_fit="contain",
1906
+ # height="auto",
1907
+ # preview=True,
1908
+ # show_share_button=True,
1909
+ # show_download_button=True
1910
+ )
1911
+ sort_rec=gr.Dropdown(["1", "2", "3", "4"],
1912
+ value=[],
1913
+ multiselect=True,
1914
+ label="Score", info="Please sort the pictures according to your preference"
1915
+ )
1916
+
1917
+ with gr.Column(scale=4,visible=False) as reco_reasons:
1918
+ recommend_bot = gr.Chatbot(label="Recommend Reasons", elem_classes="chatbot",height=600)
1919
+ recommend_score = gr.Radio(
1920
+ choices=[0,1,2,3,4,5],
1921
+ label="Score",
1922
+ interactive=True)
1923
+
1924
+
1925
+
1926
+
1927
+
1928
+
1929
  ###############################################################################
1930
  ############# this part is for text to image #############
1931
  ###############################################################################
1932
+
1933
  with gr.Row(variant="panel",visible=False) as text2image_model:
1934
 
1935
  with gr.Column():
 
2024
  # # show_download_button=True
2025
  # )
2026
 
2027
+ with gr.Row(visible=False) as export:
 
2028
  chat_log_file = gr.File(label="Download Chat Log",scale=5)
2029
+
2030
+ with gr.Row(elem_id="top_row",visible=False) as top_row:
2031
+ task_type = gr.Dropdown(
2032
+ ["Session 1","Session 2"],
2033
+ value="Session 1", label="Task", interactive=True, elem_classes="custom-language"
2034
+ )
2035
  language = gr.Dropdown(
2036
  ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
2037
  value="English", label="Language", interactive=True, elem_classes="custom-language"
 
2044
  interactive=True,
2045
  label="Generated Caption Length",
2046
  )
2047
+ # auto_play = gr.Checkbox(
2048
+ # label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
2049
+ # )
2050
+ # output_audio = gr.HTML(
2051
+ # label="Synthesised Audio", elem_classes="custom-output"
2052
+ # )
2053
 
2054
 
2055
 
 
2090
  # )
2091
  recommend_btn.click(
2092
  fn=infer,
2093
+ inputs=[new_crop_save_path,image_path],
2094
  outputs=[gallery_result]
2095
  )
2096
 
2097
  gallery_result.select(
2098
  associate,
2099
+ inputs=[paragraph,openai_api_key,language,auto_play,length,log_state,sort_rec],
2100
+ outputs=[recommend_bot,output_audio,log_state,pic_index],
2101
 
2102
 
2103
  )
 
2199
 
2200
  # mv_images = gr.State()
2201
 
2202
+ chatbot.like(print_like_dislike, inputs=[state,log_state], outputs=[log_state,chatbot])
2203
 
2204
  # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
2205
  # fn=generate_mvs,
 
2244
  # queue=False,
2245
  # show_progress=False
2246
  # )
2247
+
2248
+ recommend_score.select(
2249
+ get_recommendationscore,
2250
+ inputs=[pic_index,recommend_score,log_state],
2251
+ outputs=[log_state],
2252
+ )
2253
 
2254
 
2255
 
2256
 
2257
 
2258
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
2259
+ outputs=[export, modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt,
2260
+ modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
2261
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
2262
+ outputs=[export,modules_need_gpt1, modules_need_gpt3,
2263
  modules_not_need_gpt,
2264
+ modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
2265
+
2266
+ # disable_chatGPT_button.click(init_wo_openai_api_key,
2267
+ # outputs=[export,modules_need_gpt1, modules_need_gpt3,
2268
+ # modules_not_need_gpt,
2269
+ # modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])
2270
+
2271
+ # artist_label_base2.click(
2272
+ # get_artistinfo,
2273
+ # inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length],
2274
+ # outputs=[chatbot,state,output_audio]
2275
+ # )
 
 
 
 
 
 
 
2276
  artist_label.click(
2277
  get_artistinfo,
2278
+ inputs=[artist_label,openai_api_key,state,language,auto_play,length,log_state],
2279
+ outputs=[chatbot,state,output_audio,log_state]
 
 
 
 
 
2280
  )
2281
+ # artist_label_traj.click(
2282
+ # get_artistinfo,
2283
+ # inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length],
2284
+ # outputs=[chatbot,state,output_audio]
2285
+ # )
2286
 
2287
+ # year_label_base2.click(
2288
+ # get_yearinfo,
2289
+ # inputs=[year_label_base2,openai_api_key,state,language,auto_play,length],
2290
+ # outputs=[chatbot,state,output_audio]
2291
+ # )
2292
  year_label.click(
2293
  get_yearinfo,
2294
+ inputs=[year_label,openai_api_key,state,language,auto_play,length,log_state],
2295
+ outputs=[chatbot,state,output_audio,log_state]
 
 
 
 
 
2296
  )
2297
+ # year_label_traj.click(
2298
+ # get_yearinfo,
2299
+ # inputs=[year_label_traj,openai_api_key,state,language,auto_play,length],
2300
+ # outputs=[chatbot,state,output_audio]
2301
+ # )
2302
 
2303
 
2304
+ # enable_chatGPT_button.click(
2305
+ # lambda: (None, [], [], [[], [], []], "", "", ""),
2306
+ # [],
2307
+ # [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2308
+ # queue=False,
2309
+ # show_progress=False
2310
+ # )
2311
+ # openai_api_key.submit(
2312
+ # lambda: (None, [], [], [[], [], []], "", "", ""),
2313
+ # [],
2314
+ # [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2315
+ # queue=False,
2316
+ # show_progress=False
2317
+ # )
2318
 
2319
+ # cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
2320
+ # [paragraph_output,output_audio])
2321
 
2322
  clear_button_click.click(
2323
  lambda x: ([[], [], []], x),
 
2327
  show_progress=False
2328
  )
2329
  clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
2330
+ # clear_button_image.click(
2331
+ # lambda: (None, [], [], [[], [], []], "", "", ""),
2332
+ # [],
2333
+ # [image_input, chatbot, state, click_state, paragraph, origin_image],
2334
+ # queue=False,
2335
+ # show_progress=False
2336
+ # )
2337
+ # clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
2338
  clear_button_text.click(
2339
+ lambda: ([], [], [[], [], []],[]),
2340
  [],
2341
+ [chatbot, state, click_state,history_log],
2342
  queue=False,
2343
  show_progress=False
2344
  )
2345
  clear_button_text.click(clear_chat_memory, inputs=[visual_chatgpt])
2346
 
2347
  image_input.clear(
2348
+ lambda: (None, [], [], [[], [], []], "", None, []),
2349
  [],
2350
+ [image_input, chatbot, state, click_state, paragraph, origin_image,history_log],
2351
  queue=False,
2352
  show_progress=False
2353
  )
2354
 
2355
  image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])
2356
+
2357
+ # image_input.change(
2358
+ # lambda: ([], [], [[], [], []], [], []),
2359
+ # [],
2360
+ # [chatbot, state, click_state, history_log, log_state],
2361
+ # queue=False,
2362
+ # show_progress=False
2363
+ # )
2364
 
2365
 
2366
 
2367
 
2368
+ # image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
2369
+ # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2370
+ # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2371
+ # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2372
+ # paragraph,artist,gender,image_path])
2373
 
2374
+ # image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
2375
+ # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2376
+ # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2377
+ # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2378
+ # paragraph,artist,gender,image_path])
2379
 
2380
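+ # Uploading an image runs upload_callback, which resizes the image, asks GPT for an
+ # overall description plus metadata, and populates every copy of the name/artist/year/style labels.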
+ image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log],
2381
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2382
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2383
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2384
+ paragraph,artist,gender,image_path,log_state,history_log])
2385
 
2386
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
2387
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
 
2395
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
2396
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
2397
  # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
2398
+ chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path,log_state,history_log],
2399
+ [chatbot, state, aux_state,output_audio,log_state,history_log])
2400
  # chat_input.submit(lambda: "", None, chat_input)
2401
  chat_input.submit(lambda: {"text": ""}, None, chat_input)
2402
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2403
  # [chatbot, state, aux_state,output_audio])
2404
  # submit_button_text.click(lambda: "", None, chat_input)
2405
+ example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log],
2406
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2407
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2408
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2409
+ paragraph,artist,gender,image_path, log_state,history_log])
2410
 
2411
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
2412
 
2413
+ # def on_click_tab_selected():
2414
+ # if gpt_state ==1:
2415
+ # print(gpt_state)
2416
+ # print("using gpt")
2417
+ # return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
2418
+ # else:
2419
+ # print("no gpt")
2420
+ # print("gpt_state",gpt_state)
2421
+ # return [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2
2422
+
2423
+ # def on_base_selected():
2424
+ # if gpt_state ==1:
2425
+ # print(gpt_state)
2426
+ # print("using gpt")
2427
+ # return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
2428
+ # else:
2429
+ # print("no gpt")
2430
+ # return [gr.update(visible=False)]*4
2431
+
2432
+
2433
+ # traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2434
+ # click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2435
+ # base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
2436
+ # base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
2437
 
2438
 
2439
 
 
2443
  inputs=[
2444
  origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
2445
  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
2446
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state
2447
  ],
2448
  outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
2449
  show_progress=False, queue=True
 
2454
  submit_caption,
2455
  inputs=[
2456
  naritive, state,length, sentiment, factuality, language,
2457
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph,focus_d,openai_api_key,new_crop_save_path,gender,log_state,history_log
2458
  ],
2459
  outputs=[
2460
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
2461
  ],
2462
  show_progress=True,
2463
  queue=True
 
2471
  submit_caption,
2472
  inputs=[
2473
  naritive,state,length, sentiment, factuality, language,
2474
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,auto_play, paragraph,focus_da,openai_api_key,new_crop_save_path,gender,log_state,
2475
+ history_log
2476
  ],
2477
  outputs=[
2478
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
2479
+ ],
2480
  show_progress=True,
2481
  queue=True
2482
  )
 
2487
  inputs=[
2488
  naritive,state,length, sentiment, factuality, language,
2489
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2490
+ auto_play, paragraph,focus_dai,openai_api_key,new_crop_save_path,gender,log_state,history_log
2491
  ],
2492
  outputs=[
2493
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
2494
  ],
2495
  show_progress=True,
2496
  queue=True
 
2502
  inputs=[
2503
  naritive,state,length, sentiment, factuality, language,
2504
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2505
+ auto_play, paragraph,focus_dda,openai_api_key,new_crop_save_path,gender,log_state,history_log
2506
  ],
2507
  outputs=[
2508
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
2509
  ],
2510
  show_progress=True,
2511
  queue=True
 
2545
 
2546
  export_button.click(
2547
  export_chat_log,
2548
+ inputs=[log_state,log_list,naritive],
2549
  outputs=[chat_log_file,log_list],
2550
  queue=True
2551
  )
2552
 
2553
  naritive.change(
2554
+ change_naritive,
2555
+ [task_type, image_input, chatbot, state, click_state, paragraph, origin_image,naritive,language],
2556
+ [image_input, chatbot, state, click_state, paragraph, origin_image,gallery_result],
2557
  queue=False,
2558
  show_progress=False
2559
 
2560
  )
2561
 
2562
+ task_type.change(
2563
+ lambda: ([]),
2564
+ [],
2565
+ [log_state]
2566
+ )
2567
+
2568
  # upvote_btn.click(
2569
  # handle_liked,
2570
  # inputs=[state,like_res],