Niki Zhang committed on
Commit abb5985 · verified · 1 Parent(s): 1db79b2

Update app.py

Files changed (1)
  1. app.py +171 -123
app.py CHANGED
@@ -347,33 +347,59 @@ def extract_features_siglip(image):
      return image_features
 
  @spaces.GPU
- def infer(crop_image_path,full_image_path):
-     input_image = Image.open(crop_image_path).convert("RGB")
-     input_features = extract_features_siglip(input_image.convert("RGB"))
-     input_features = input_features.detach().cpu().numpy()
-     input_features = np.float32(input_features)
-     faiss.normalize_L2(input_features)
-     distances, indices = index.search(input_features, 2)
      gallery_output = []
-     for i,v in enumerate(indices[0]):
-         sim = -distances[0][i]
-         image_url = df.iloc[v]["Link"]
-         img_retrieved = read_image_from_url(image_url)
-         gallery_output.append(img_retrieved)
-
-     input_image = Image.open(full_image_path).convert("RGB")
-     input_features = extract_features_siglip(input_image.convert("RGB"))
-     input_features = input_features.detach().cpu().numpy()
-     input_features = np.float32(input_features)
-     faiss.normalize_L2(input_features)
-     distances, indices = index.search(input_features, 2)
-     for i,v in enumerate(indices[0]):
-         sim = -distances[0][i]
-         image_url = df.iloc[v]["Link"]
-         img_retrieved = read_image_from_url(image_url)
-         gallery_output.append(img_retrieved)
-
-     return gallery_output
 
 
  ###############################################################################
@@ -530,11 +556,17 @@ css = """
      background: white !important;
      border: none !important;
      box-shadow: none !important;
  }
 
- info_btn_interact {
-     background: white !important;
      box-shadow: none !important;
  }
 
  .function_button {
@@ -590,7 +622,27 @@ prompt_list = [
  ]
  ]
 
 
  gpt_state = 0
  VOICE = "en-GB-SoniaNeural"
@@ -722,11 +774,11 @@ def init_openai_api_key(api_key=""):
  global gpt_state
  gpt_state=1
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]* 3 + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3+[gr.update(visible=False)]
  else:
  gpt_state=0
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
- return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*4
 
  def init_wo_openai_api_key():
  global gpt_state
@@ -801,9 +853,9 @@ async def chat_input_callback(*args):
  return state, state, None, audio,log_state,history
 
 
- def upload_callback(image_input, state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None):
  print("narritive", narritive)
-
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
  image_input = image_input['background']
 
@@ -848,76 +900,60 @@ def upload_callback(image_input, state, log_state, visual_chatgpt=None, openai_a
  print('upload_callback: add caption to chatGPT memory')
  new_image_path = get_new_image_name('chat_image', func_name='upload')
  image_input.save(new_image_path)
  visual_chatgpt.current_image = new_image_path
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
- parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
  print(parsed_data)
  parsed_data = json.loads(parsed_data.replace("'", "\""))
  name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
  gender=gender.lower()
  print("gender",gender)
 
 
  if language=="English":
-     if narritive=="Third-person" :
-         state = [
-             (
-                 None,
-                 f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
-             )
-         ]
-     elif narritive=="Single-Persona: Artist":
-         state = [
-             (
-                 None,
-                 f"🧑‍🎨 Hello, I am the {artist}. Welcome to explore my painting, '{name}'. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant insights and thoughts behind my creation."
-             )
-         ]
-     elif narritive=="Multi-Persona: Objects":
-         state = [
-             (
-                 None,
-                 f"🎨 Hello, Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with relevant insights and thoughts from the perspective of the objects within the painting"
-             )
-         ]
  elif language=="Chinese":
-     if narritive == "Third-person":
-         state = [
-             (
-                 None,
-                 f"🤖 你好,我是 EyeSee。让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供相关的信息。"
-             )
-         ]
-     elif narritive == "Single-Persona: Artist":
-         state = [
-             (
-                 None,
-                 f"🧑‍🎨 你好,我是{artist}。欢迎探索我的画作《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供我的创作背后的相关见解和想法。"
-             )
-         ]
-     elif narritive == "Multi-Persona: Objects":
-         state = [
-             (
-                 None,
-                 f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
-             )
-         ]
 
  log_state += [(name,None)]
  log_state=log_state+[(paragraph,None)]
  log_state=log_state+[(narritive,None)]
  log_state=log_state+state
  log_state = log_state + [("%% basic information %%", None)]
 
  history=[]
- history.append({"role": "assistant", "content": paragraph+state[0][1]})
 
 
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
-     original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history]
 
@@ -1056,7 +1092,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
  read_info = re.sub(r'[#[\]!*]','',focus_info)
  read_info = emoji.replace_emoji(read_info,replace="")
  print("read info",read_info)
- if naritive=="Item":
  parsed_data = get_gpt_response(openai_api_key, new_crop_save_path,prompt = f"Based on the information {focus_info}, return the gender of this item, returns its most likely gender, do not return unknown, in the format {{\"gender\": \"<gender>\"}}")
  parsed_data = json.loads(parsed_data)
 
@@ -1088,7 +1124,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,log_state,history
 
 
-
 
  def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):
 
@@ -1101,8 +1137,6 @@ def generate_prompt(focus_type, paragraph,length, sentiment, factuality, languag
  'language': language
  }
 
- naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}
-
  naritive_value=naritive_mapping[naritive]
 
  if mapped_value != -1:
@@ -1139,9 +1173,17 @@ def get_gpt_response(api_key, image_path, prompt, history=None):
  history = []
 
  messages = history[:]
-
  if image_path:
-     base64_image = encode_image(image_path)
  messages.append({
      "role": "user",
      "content": [
@@ -1152,7 +1194,7 @@ def get_gpt_response(api_key, image_path, prompt, history=None):
  {
      "type": "image_url",
      "image_url": {
-         "url": f"data:image/jpeg;base64,{base64_image}"
      }
  }
  ]
@@ -1176,6 +1218,10 @@ def get_gpt_response(api_key, image_path, prompt, history=None):
  print("gpt result",result)
  try:
      content = result['choices'][0]['message']['content']
      return content
  except (KeyError, IndexError, json.JSONDecodeError) as e:
      return json.dumps({"error": "Failed to parse model output", "details": str(e)})
@@ -1533,15 +1579,17 @@ async def texttospeech(text, language,gender='female'):
  return None
 
  # give the reason of recommendation
- async def associate(focus_info,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,evt: gr.SelectData):
  rec_path=evt._data['value']['image']['path']
  index=evt.index
  print("rec_path",rec_path)
- prompt="""
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects in the second painting that may be related to the selected object and list one fact of selected object, one fact of related object in the second painting and one analysis between two objects as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
- """
- prompt=prompt.format(Wiki_caption=focus_info,language=language,length=length)
- result=get_gpt_response(openai_api_key, rec_path, prompt)
  print("recommend result",result)
  reason = [(None, f"{result}")]
  log_state = log_state + [(narritive, None)]
@@ -1550,10 +1598,10 @@ async def associate(focus_info,openai_api_key,language,autoplay,length,log_state
  read_info = re.sub(r'[#[\]!*]','',result)
  read_info = emoji.replace_emoji(read_info,replace="")
  print("associate",read_info)
  if autoplay:
      audio_output = await texttospeech(read_info, language)
-     return reason,audio_output,log_state,index
- return reason,None,log_state,index
 
  def change_naritive(task_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
  if task_type=="Session 1":
@@ -1648,9 +1696,9 @@ def create_ui():
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
 
  examples = [
-     ["test_images/1.The Ambassadors.jpg"],
-     ["test_images/2.Football Players.jpg"],
-     ["test_images/3.Along the River during the Qingming Festival.jpeg"],
      # ["test_images/test3.jpg"],
      # ["test_images/test4.jpg"],
      # ["test_images/test5.jpg"],
@@ -1704,7 +1752,9 @@ def create_ui():
  output_audio = gr.HTML(
      label="Synthesised Audio", elem_classes="custom-output"
  )
- with gr.Row():
  with gr.Column(scale=6):
      with gr.Column(visible=False) as modules_not_need_gpt:
 
@@ -1735,11 +1785,7 @@ def create_ui():
  year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
  material_label = gr.Button(value="Style: ",elem_classes="info_btn")
 
- with gr.Row():
-     gr.Examples(
-         examples=examples,
-         inputs=[example_image],
-     )
 
 
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
@@ -1824,11 +1870,6 @@ def create_ui():
  )
 
 
-
-
-
-
-
  with gr.Column(scale=4):
      with gr.Column(visible=True) as module_key_input:
          openai_api_key = gr.Textbox(
@@ -1892,6 +1933,12 @@ def create_ui():
  with gr.Row():
      with gr.Column(scale=6):
          with gr.Column(visible=False) as recommend:
              gallery_result = gr.Gallery(
                  label="Recommendations",
                  height="auto",
@@ -1906,19 +1953,20 @@ def create_ui():
              # show_share_button=True,
              # show_download_button=True
              )
-             sort_rec=gr.Dropdown(["1", "2", "3", "4"],
-                 value=[],
-                 multiselect=True,
-                 label="Score", info="Please sort the pictures according to your preference"
-             )
 
      with gr.Column(scale=4,visible=False) as reco_reasons:
          recommend_bot = gr.Chatbot(label="Recommend Reasons", elem_classes="chatbot",height=600)
          recommend_score = gr.Radio(
-             choices=[0,1,2,3,4,5],
              label="Score",
              interactive=True)
-
 
 
@@ -2088,14 +2136,14 @@ def create_ui():
  # )
  recommend_btn.click(
      fn=infer,
-     inputs=[new_crop_save_path,image_path],
-     outputs=[gallery_result]
  )
 
  gallery_result.select(
      associate,
-     inputs=[paragraph,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive],
-     outputs=[recommend_bot,output_audio,log_state,pic_index],
 
 
  )
@@ -2255,11 +2303,11 @@ def create_ui():
 
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
      outputs=[export, modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt,
-     modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
      outputs=[export,modules_need_gpt1, modules_need_gpt3,
          modules_not_need_gpt,
-         modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
 
  # disable_chatGPT_button.click(init_wo_openai_api_key,
  # outputs=[export,modules_need_gpt1, modules_need_gpt3,
@@ -2375,11 +2423,11 @@ def create_ui():
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
  # paragraph,artist,gender,image_path])
 
- image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log],
      [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-     paragraph,artist,gender,image_path,log_state,history_log])
 
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2400,11 +2448,11 @@ def create_ui():
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
  # [chatbot, state, aux_state,output_audio])
  # submit_button_text.click(lambda: "", None, chat_input)
- example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log],
      [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-     paragraph,artist,gender,image_path, log_state,history_log])
 
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
 
 
  return image_features
 
  @spaces.GPU
+ def infer(crop_image_path,full_image_path,state,language):
      gallery_output = []
+     if crop_image_path:
+         input_image = Image.open(crop_image_path).convert("RGB")
+         input_features = extract_features_siglip(input_image.convert("RGB"))
+         input_features = input_features.detach().cpu().numpy()
+         input_features = np.float32(input_features)
+         faiss.normalize_L2(input_features)
+         distances, indices = index.search(input_features, 2)
+         for i,v in enumerate(indices[0]):
+             sim = -distances[0][i]
+             image_url = df.iloc[v]["Link"]
+             img_retrieved = read_image_from_url(image_url)
+             gallery_output.append(img_retrieved)
+
+         input_image = Image.open(full_image_path).convert("RGB")
+         input_features = extract_features_siglip(input_image.convert("RGB"))
+         input_features = input_features.detach().cpu().numpy()
+         input_features = np.float32(input_features)
+         faiss.normalize_L2(input_features)
+         distances, indices = index.search(input_features, 2)
+         for i,v in enumerate(indices[0]):
+             sim = -distances[0][i]
+             image_url = df.iloc[v]["Link"]
+             img_retrieved = read_image_from_url(image_url)
+             gallery_output.append(img_retrieved)
+         if language=="English":
+             msg="🖼️ Please refer to the section below to see the recommended results."
+         else:
+             msg="🖼️ 请到下方查看推荐结果。"
+         state+=[(None,msg)]
+
+         return gallery_output,state,state
+     else:
+         input_image = Image.open(full_image_path).convert("RGB")
+         input_features = extract_features_siglip(input_image.convert("RGB"))
+         input_features = input_features.detach().cpu().numpy()
+         input_features = np.float32(input_features)
+         faiss.normalize_L2(input_features)
+         distances, indices = index.search(input_features, 4)
+         for i,v in enumerate(indices[0]):
+             sim = -distances[0][i]
+             image_url = df.iloc[v]["Link"]
+             img_retrieved = read_image_from_url(image_url)
+             gallery_output.append(img_retrieved)
+         if language=="English":
+             msg="🖼️ Please refer to the section below to see the recommended results."
+         else:
+             msg="🖼️ 请到下方查看推荐结果。"
+         state+=[(None,msg)]
+
+         return gallery_output,state,state
+
 
 
  ###############################################################################
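For context, infer assumes a module-level FAISS index built over SigLIP embeddings of the gallery images listed in df["Link"]. Below is a minimal sketch of how such an index might be assembled; the checkpoint name, CSV path, column name, and helper definitions are assumptions for illustration, not taken from this commit.

# Sketch only: build a cosine-similarity FAISS index over SigLIP image features.
import faiss
import numpy as np
import pandas as pd
import requests
import torch
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, SiglipVisionModel

processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")  # assumed checkpoint
model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")

def read_image_from_url(url):
    # Download one gallery image; infer() reuses a helper like this for retrieved results.
    return Image.open(BytesIO(requests.get(url, timeout=10).content)).convert("RGB")

def extract_features_siglip(image):
    # Illustrative stand-in for the app's feature extractor.
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        return model(**inputs).pooler_output  # shape (1, hidden_dim)

df = pd.read_csv("gallery.csv")  # assumed to contain a "Link" column of image URLs
feats = []
for url in df["Link"]:
    f = extract_features_siglip(read_image_from_url(url)).cpu().numpy().astype(np.float32)
    faiss.normalize_L2(f)  # normalize so inner product equals cosine similarity
    feats.append(f)
feats = np.vstack(feats)

index = faiss.IndexFlatIP(feats.shape[1])  # inner-product index over normalized vectors
index.add(feats)

With an index built this way, index.search(query_features, k) returns the k most similar gallery rows, which is what infer maps back to df["Link"] URLs.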
 
      background: white !important;
      border: none !important;
      box-shadow: none !important;
+     font-size: 15px !important;
+     min-width: 6rem !important;
+     max-width: 10rem !important;
  }
 
+ .info_btn_interact {
+     background: rgb(242, 240, 233) !important;
      box-shadow: none !important;
+     font-size: 15px !important;
+     min-width: 6rem !important;
+     max-width: 10rem !important;
  }
 
  .function_button {
 
  ]
  ]
 
+ recommendation_prompt=[
+ '''I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:Recommendation reason: {{Recommendation based on objects in the image or Recommendation based on overall visual similarity}}
+ Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1.Each bullet point should be in {language} language, with a response length of about {length} words.''',
+ '''
+ When generating the answer, you should tell others that you are the creators of the first paintings and generate the text in the tone and manner as if you are the creator of the painting.
+ I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
+ Recommendation reason: {{ As the author of the first painting, I recommend based on the object I painted OR As the author of the first painting, I recommend based on the overall similarity in appearance}}
+ Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I.
+ Each bullet point should be in {language} language, with a response length of about {length} words.
+ ''',
+ '''
+ When generating answers, you should tell people that you are the object itself that was selected in the first painting, and generate text in the tone and manner in which you are the object
+ I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
+ Recommendation reason: {{As an object in the first painting, I am recommending based on myself OR As an object in the first painting, I am recommending based on the overall similarity of the first painting's appearance}}
+ Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I.
+ Each bullet point should be in {language} language, with a response length of about {length} words.
+ '''
+
+ ]
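The three templates line up with the Third-person, Artist, and Objects entries of naritive_mapping. A small illustrative check of how one is selected and filled in; note that the doubled braces {{...}} survive str.format as literal braces, while only {language} and {length} are substituted.

# Sketch only: select a persona template and fill its placeholders.
naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}

persona = naritive_mapping["Single-Persona: Artist"]   # -> 1
prompt = recommendation_prompt[persona].format(language="English", length=60)

# {{ ... }} in the template renders as literal { ... } in the final prompt text,
# so the "Recommendation reason: {...}" choices reach the model verbatim.
print(prompt)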
 
  gpt_state = 0
  VOICE = "en-GB-SoniaNeural"
 
  global gpt_state
  gpt_state=1
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]* 3 + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*4+[gr.update(visible=False)]
  else:
  gpt_state=0
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
+ return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*5
 
  def init_wo_openai_api_key():
  global gpt_state
 
  return state, state, None, audio,log_state,history
 
 
+ async def upload_callback(image_input,state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None,autoplay=False,session="Session 1"):
  print("narritive", narritive)
+ print("image input",image_input)
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
  image_input = image_input['background']
 
 
  print('upload_callback: add caption to chatGPT memory')
  new_image_path = get_new_image_name('chat_image', func_name='upload')
  image_input.save(new_image_path)
+ print("img_path",new_image_path)
  visual_chatgpt.current_image = new_image_path
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+ parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\"}")
  print(parsed_data)
  parsed_data = json.loads(parsed_data.replace("'", "\""))
  name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
  gender=gender.lower()
  print("gender",gender)
 
+
  if language=="English":
+     if naritive_mapping[narritive]==0 :
+         msg=f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
+
+     elif naritive_mapping[narritive]==1:
+         msg=f"🧑‍🎨 Hello, I am the {artist}. Welcome to explore my painting, '{name}'. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant insights and thoughts behind my creation."
+
+     elif naritive_mapping[narritive]==2:
+         msg=f"🎨 Hello, Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with relevant insights and thoughts from the perspective of the objects within the painting"
+
  elif language=="Chinese":
+     if naritive_mapping[narritive]==0:
+         msg=f"🤖 你好,我是 EyeSee。让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供相关的信息。"
+
+     elif naritive_mapping[narritive]==1:
+         msg=f"🧑‍🎨 你好,我是{artist}。欢迎探索我的画作《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供我的创作背后的相关见解和想法。"
+
+     elif naritive_mapping[narritive]==2:
+         msg=f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
+
 
+ state = [(msg,None)]
  log_state += [(name,None)]
  log_state=log_state+[(paragraph,None)]
  log_state=log_state+[(narritive,None)]
  log_state=log_state+state
  log_state = log_state + [("%% basic information %%", None)]
+ read_info=emoji.replace_emoji(msg,replace="")
 
  history=[]
+ history.append({"role": "assistant", "content": paragraph+msg})
+
+ audio_output = None
+ if autoplay:
+     audio_output = await texttospeech(read_info, language,gender)
 
 
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
+     original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history,audio_output]
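upload_callback expects the metadata reply to be a flat, dictionary-shaped string, which it normalizes with replace("'", "\"") before json.loads. A hedged example of that round trip; the sample values are only illustrative.

import json

# Example reply in the single-quoted form the model sometimes returns (values are illustrative).
reply = "{ 'name': 'The Ambassadors', 'artist': 'Hans Holbein the Younger', 'year': '1533 (Northern Renaissance)', 'style': 'Oil on oak', 'gender': 'Male' }"

parsed_data = json.loads(reply.replace("'", "\""))
name, artist, year, material, gender = (
    parsed_data["name"], parsed_data["artist"], parsed_data["year"],
    parsed_data["style"], parsed_data["gender"].lower(),
)
# Note: the quote swap is a blunt normalization; it would corrupt values that themselves
# contain apostrophes, which is why the prompt asks for clean dictionary output.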
 
 
  read_info = re.sub(r'[#[\]!*]','',focus_info)
  read_info = emoji.replace_emoji(read_info,replace="")
  print("read info",read_info)
+ if naritive_mapping[naritive]==2:
  parsed_data = get_gpt_response(openai_api_key, new_crop_save_path,prompt = f"Based on the information {focus_info}, return the gender of this item, returns its most likely gender, do not return unknown, in the format {{\"gender\": \"<gender>\"}}")
  parsed_data = json.loads(parsed_data)
 
  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,log_state,history
 
 
+ naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}
 
  def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):
 
  'language': language
  }
 
  naritive_value=naritive_mapping[naritive]
 
  if mapped_value != -1:
 
  history = []
 
  messages = history[:]
+ base64_images = []
+
  if image_path:
+     if isinstance(image_path, list):
+         for img in image_path:
+             base64_image = encode_image(img)
+             base64_images.append(base64_image)
+     else:
+         base64_image = encode_image(image_path)
+         base64_images.append(base64_image)
+
  messages.append({
      "role": "user",
      "content": [
  {
      "type": "image_url",
      "image_url": {
+         "url": f"data:image/jpeg;base64,{base64_images}"
      }
  }
  ]
  print("gpt result",result)
  try:
      content = result['choices'][0]['message']['content']
+     if content.startswith("```json"):
+         content = content[7:]
+     if content.endswith("```"):
+         content = content[:-3]
      return content
  except (KeyError, IndexError, json.JSONDecodeError) as e:
      return json.dumps({"error": "Failed to parse model output", "details": str(e)})
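get_gpt_response now collects every path into base64_images so the recommendation flow can send two images at once. A minimal sketch of a multi-image Chat Completions payload with one image_url entry per encoded image, which is the format the vision endpoint expects; the helper names here are assumptions.

import base64

def encode_image(image_path):
    # Return the file contents as a base64 string suitable for a data: URL.
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

def build_vision_message(prompt, image_paths):
    # One text part plus one image_url part per image.
    content = [{"type": "text", "text": prompt}]
    for path in image_paths:
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(path)}"},
        })
    return {"role": "user", "content": content}

# Example: message = build_vision_message(prompt, [new_crop_save_path, rec_path])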
 
  return None
 
  # give the reason of recommendation
+ async def associate(image_path,new_crop,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,evt: gr.SelectData):
+     persona=naritive_mapping[narritive]
      rec_path=evt._data['value']['image']['path']
      index=evt.index
      print("rec_path",rec_path)
+     prompt=recommendation_prompt[persona].format(language=language,length=length)
+     if new_crop:
+         image_paths=[new_crop,rec_path]
+     else:
+         image_paths=[image_path,rec_path]
+     result=get_gpt_response(openai_api_key, image_paths, prompt)
      print("recommend result",result)
      reason = [(None, f"{result}")]
      log_state = log_state + [(narritive, None)]
      read_info = re.sub(r'[#[\]!*]','',result)
      read_info = emoji.replace_emoji(read_info,replace="")
      print("associate",read_info)
+     audio_output=None
      if autoplay:
          audio_output = await texttospeech(read_info, language)
+     return reason,audio_output,log_state,index,gr.update(value=[])
 
  def change_naritive(task_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
      if task_type=="Session 1":
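associate is wired to gallery_result.select, so Gradio passes a gr.SelectData event describing the clicked thumbnail. A hedged sketch of the documented event fields a handler can rely on; recent Gradio versions expose .index and .value, whereas evt._data used above is an internal field.

import gradio as gr

def on_gallery_select(evt: gr.SelectData):
    # evt.index is the position of the clicked item; evt.value is its data
    # (for a Gallery this is typically a dict describing the image).
    print("selected index:", evt.index)
    print("selected value:", evt.value)
    return evt.index

# Example wiring: gallery_result.select(on_gallery_select, inputs=None, outputs=[pic_index])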
 
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
 
  examples = [
+     ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg"],
+     ["test_images/2.Football Players.jpg","test_images/task2.jpg"],
+     ["test_images/3.Along the River during the Qingming Festival.jpeg","test_images/task3.jpg"],
      # ["test_images/test3.jpg"],
      # ["test_images/test4.jpg"],
      # ["test_images/test5.jpg"],
 
  output_audio = gr.HTML(
      label="Synthesised Audio", elem_classes="custom-output"
  )
+ with gr.Row():
+     with gr.Column(scale=1,min_width=50,visible=False) as instruct:
+         task_instuction=gr.Image(type="pil", interactive=True, elem_classes="task_instruct",height=650,label=None)
  with gr.Column(scale=6):
      with gr.Column(visible=False) as modules_not_need_gpt:
 
  year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
  material_label = gr.Button(value="Style: ",elem_classes="info_btn")
 
+
 
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
 
  )
 
 
  with gr.Column(scale=4):
      with gr.Column(visible=True) as module_key_input:
          openai_api_key = gr.Textbox(
 
  with gr.Row():
      with gr.Column(scale=6):
          with gr.Column(visible=False) as recommend:
+             sort_rec=gr.Dropdown(["1", "2", "3", "4"],
+                 value=[],
+                 multiselect=True,
+                 label="Score", info="Please sort the pictures according to your preference"
+             )
+
              gallery_result = gr.Gallery(
                  label="Recommendations",
                  height="auto",
              # show_share_button=True,
              # show_download_button=True
              )
+
 
      with gr.Column(scale=4,visible=False) as reco_reasons:
          recommend_bot = gr.Chatbot(label="Recommend Reasons", elem_classes="chatbot",height=600)
          recommend_score = gr.Radio(
+             choices=[1,2,3,4,5,6,7],
              label="Score",
              interactive=True)
+
+ with gr.Row():
+     gr.Examples(
+         examples=examples,
+         inputs=[example_image,task_instuction],
+     )
 
 
 
  # )
  recommend_btn.click(
      fn=infer,
+     inputs=[new_crop_save_path,image_path,state,language],
+     outputs=[gallery_result,chatbot,state]
  )
 
  gallery_result.select(
      associate,
+     inputs=[image_path,new_crop_save_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive],
+     outputs=[recommend_bot,output_audio,log_state,pic_index,recommend_score],
 
 
  )
 
 
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
      outputs=[export, modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt,
+     modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3])
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
      outputs=[export,modules_need_gpt1, modules_need_gpt3,
          modules_not_need_gpt,
+         modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3])
 
  # disable_chatGPT_button.click(init_wo_openai_api_key,
  # outputs=[export,modules_need_gpt1, modules_need_gpt3,
 
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
  # paragraph,artist,gender,image_path])
 
+ image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,task_type],
      [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+     paragraph,artist,gender,image_path,log_state,history_log,output_audio])
 
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
 
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
  # [chatbot, state, aux_state,output_audio])
  # submit_button_text.click(lambda: "", None, chat_input)
+ example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,task_type],
      [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+     paragraph,artist,gender,image_path, log_state,history_log,output_audio])
 
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])