Niki Zhang committed (verified)
Commit 4da0523 · 1 Parent(s): 06cd0b8

Update app.py

Files changed (1)
  1. app.py +79 -79
app.py CHANGED
@@ -512,10 +512,6 @@ css = """
 }
 
 
-.image_upload {
-    height: 650px;
-}
-
 .info_btn {
     background: white !important;
     border: none !important;
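
Note: the deleted `.image_upload` height rule is superseded by sizing each image component directly; the UI hunks below pass Gradio's built-in parameter instead, e.g.:

```python
# height is the gr.Image parameter (in pixels), replacing the removed CSS class rule.
image_input = gr.Image(type="pil", interactive=True, elem_classes="image_upload", height=650)
```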
@@ -569,23 +565,22 @@ prompt_list = [
 '''
 prompt_list = [
     [
-
         'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
-        'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
+        'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
         'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
-        'You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
+        'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one object judgement and one whole art judgement (how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
     ],
     [
-        'When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
-        'When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
-        'When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
-        'You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
+        "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.",
+        "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. You have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.",
+        "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. You have to help me understand what is about the selected object and list one fact, one analysis, and one interpretation as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.",
+        'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one object judgement and one whole art judgement (how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
     ],
     [
-        'When generating answers, you should tell people that you are the object or the person itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
-        'When generating answers, you should tell people that you are the object or the person itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
-        'When generating answers, you should tell people that you are the object or the person itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
-        'You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
+        'When generating answers, you should tell people that you are the object or the person itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
+        'When generating answers, you should tell people that you are the object or the person itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
+        'When generating answers, you should tell people that you are the object or the person itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis and one interpretation as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
+        'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one object judgement and one whole art judgement (how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.'
     ]
 ]
 
@@ -770,10 +765,14 @@ def update_click_state(click_state, caption, click_mode):
         raise NotImplementedError
 
 async def chat_input_callback(*args):
-    visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender = args
+    visual_chatgpt, chat_input, click_state, state, aux_state, language, autoplay, gender, api_key, image_input = args
     message = chat_input["text"]
+    prompt = "Please help me answer the question with this painting. "
+    state = state + [(message, None)]
     if visual_chatgpt is not None:
-        state, _, aux_state, _ = visual_chatgpt.run_text(message, state, aux_state)
+        result = get_gpt_response(api_key, image_input, prompt + message)
+        state = state + [(None, result)]
+        # state, _, aux_state, _ = visual_chatgpt.run_text(message, state, aux_state)
         last_text, last_response = state[-1]
         print("last response",last_response)
         if autoplay==False:
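
Note: the rewritten callback answers free-text chat directly against the uploaded painting instead of routing through `visual_chatgpt.run_text`. A minimal sketch of the flow, assuming the `get_gpt_response` helper and the `image_path` state wired later in this diff:

```python
PREFIX = "Please help me answer the question with this painting. "

def answer_about_image(api_key, image_path, message, state):
    # Each chatbot turn is a (user, bot) pair: post the question first,
    # then append the model's reply as a second pair.
    state = state + [(message, None)]
    result = get_gpt_response(api_key, image_path, PREFIX + message)
    return state + [(None, result)]
```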
@@ -886,7 +885,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
 
 
     return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
-            original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender]
+            original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path]
 
 
@@ -965,7 +964,7 @@ query_focus = {
     "D": "Provide a description of the item.",
     "DA": "Provide a description and analysis of the item.",
     "DAI": "Provide a description, analysis, and interpretation of the item.",
-    "DDA": "Evaluate the item."
+    "Judge": "Evaluate the item."
 }
 
@@ -1029,18 +1028,18 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
             audio_output = await texttospeech(read_info, language, autoplay,gender)
             print("done")
             # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output,gender,focus_info
+            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
 
         except Exception as e:
             state = state + [(None, f"Error during TTS prediction: {str(e)}")]
             print(f"Error during TTS prediction: {str(e)}")
             # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
-            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output,gender,focus_info
+            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
 
     else:
         state = state + [(None, f"Error during TTS prediction: {str(e)}")]
         print(f"Error during TTS prediction: {str(e)}")
-        return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,focus_info
+        return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
 
 
@@ -1090,30 +1089,39 @@ def get_gpt_response(api_key, image_path, prompt, enable_wiki=None):
         "Content-Type": "application/json",
         "Authorization": f"Bearer {api_key}"
     }
-
+    base64_images = []
+
     if image_path:
-        base64_image = encode_image(image_path)
-        payload = {
-            "model": "gpt-4o",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": prompt
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/jpeg;base64,{base64_image}"
-                            }
-                        }
-                    ]
-                }
-            ],
-            "max_tokens": 300
-        }
+        if isinstance(image_path, list):
+            for img in image_path:
+                base64_image = encode_image(img)
+                base64_images.append(base64_image)
+        else:
+            base64_image = encode_image(image_path)
+            base64_images.append(base64_image)
+
+        payload = {
+            "model": "gpt-4o",
+            "messages": [
+                {
+                    "role": "user",
+                    # one text part, then one image_url part per encoded image
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        }
+                    ] + [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}"
+                            }
+                        }
+                        for base64_image in base64_images
+                    ]
+                }
+            ],
+            "max_tokens": 300
+        }
     else:
         payload = {
             "model": "gpt-4o",
@@ -1494,21 +1502,13 @@ async def texttospeech(text, language, autoplay,gender='female'):
         print(f"Error in texttospeech: {e}")
         return None
 
-async def associate(focus_info,openai_api_key,language,state,autoplay,evt: gr.SelectData):
+async def associate(focus_info,openai_api_key,language,state,autoplay,length, evt: gr.SelectData):
     rec_path=evt._data['value']['image']['path']
     print("rec_path",rec_path)
     prompt="""
-    The information and image I gave you are 2 different paintings. Please analyze the relationship between the image and the information {focus_info}. Discuss their similarities and differences in terms of style, themes, colors, and any other relevant aspects. Provide a detailed analysis that highlights how the information fits into or contrasts with the recommended painting. Consider the following points in your analysis:
-    - Artistic style and techniques
-    - Themes and subjects
-    - Color palettes and compositions
-    - Historical and cultural contexts
-    - Symbolism and meanings
-
-    Based on your analysis, provide insights into how the information enhances or contrasts with the recommended painting, and suggest any interesting interpretations or observations. Return your response in {language}
-
+    Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects in the second painting that may be related to the selected object, and list one fact of the selected object, one fact of the related object in the second painting, and one analysis of the relationship between the two objects as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.
     """
-    prompt=prompt.format(focus_info=focus_info,language=language)
+    prompt=prompt.format(Wiki_caption=focus_info,language=language,length=length)
     result=get_gpt_response(openai_api_key, rec_path, prompt)
     state = state + [(None, f"{result}")]
     read_info = re.sub(r'[#[\]!*]','',result)
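
Note: `evt._data` is a private Gradio attribute; the public `evt.value` carries the same payload for Gallery selections. A defensive sketch, assuming Gradio 4's gallery select payload shape:

```python
def selected_gallery_path(evt: gr.SelectData) -> str:
    # For a gr.Gallery select event, evt.value is typically
    # {"image": {"path": ..., "url": ...}, "caption": ...}.
    value = evt.value
    if isinstance(value, dict) and "image" in value:
        return value["image"]["path"]
    return value
```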
@@ -1559,11 +1559,11 @@ def create_ui():
 
     examples = [
         ["test_images/ambass.jpg"],
-        ["test_images/test1.png"],
-        ["test_images/test2.png"],
-        ["test_images/test3.png"],
-        ["test_images/test4.png"],
-        ["test_images/test5.png"],
+        ["test_images/test1.jpg"],
+        ["test_images/test2.jpg"],
+        ["test_images/test3.jpg"],
+        ["test_images/test4.jpg"],
+        ["test_images/test5.jpg"],
         ["test_images/Picture5.png"],
 
     ]
@@ -1597,7 +1597,7 @@ def create_ui():
     point_prompt = gr.State("Positive")
     log_list=gr.State([])
     gender=gr.State('female')
-    focus_info=gr.State('')
+    image_path=gr.State('')
     # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
     #     with gr.Column(scale=0.5):
     #         # gr.Markdown("Left side content")
@@ -1648,7 +1648,7 @@ def create_ui():
             with gr.Column(scale=6):
                 with gr.Column(visible=False) as modules_not_need_gpt:
                     with gr.Tab("Base(GPT Power)") as base_tab:
-                        image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload")
+                        image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload", height=650)
                         with gr.Row():
                             name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
                             artist_label_base = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
@@ -1656,7 +1656,7 @@ def create_ui():
                             material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")
 
                     with gr.Tab("Base2") as base_tab2:
-                        image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload")
+                        image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload", height=650)
                         with gr.Row():
                             name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
                             artist_label_base2 = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
@@ -1666,7 +1666,7 @@ def create_ui():
                     with gr.Tab("Click") as click_tab:
                         with gr.Row():
                             with gr.Column(scale=10,min_width=600):
-                                image_input = gr.Image(type="pil", interactive=True, elem_classes="image_upload")
+                                image_input = gr.Image(type="pil", interactive=True, elem_classes="image_upload", height=650)
                                 example_image = gr.Image(type="pil", interactive=False, visible=False)
                                 with gr.Row():
                                     name_label = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1977,7 +1977,7 @@ def create_ui():
 
         gallery_result.select(
             associate,
-            inputs=[focus_info,openai_api_key,language,state,auto_play],
+            inputs=[paragraph,openai_api_key,language,state,auto_play,length],
            outputs=[chatbot,state,output_audio],
 
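Note: Gradio passes `inputs` positionally and appends the select event as the final argument, so `associate`'s signature must mirror this list exactly:

```python
# [paragraph, openai_api_key, language, state, auto_play, length] map onto the
# first six parameters; the gr.SelectData event arrives last.
async def associate(focus_info, openai_api_key, language, state, autoplay, length,
                    evt: gr.SelectData):
    ...
```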
 
@@ -2243,19 +2243,19 @@ def create_ui():
                       [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                        image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                        name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-                       paragraph,artist,gender])
+                       paragraph,artist,gender,image_path])
 
-        # image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key],
-        #                     [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
-        #                      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
-        #                      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-        #                      paragraph,artist])
+        image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
+                            [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+                             image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+                             name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+                             paragraph,artist,gender,image_path])
 
-        # image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key],
-        #                     [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
-        #                      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
-        #                      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-        #                      paragraph,artist])
+        image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key,language,naritive],
+                            [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+                             image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+                             name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+                             paragraph,artist,gender,image_path])
 
         # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
         #                     [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2269,7 +2269,7 @@ def create_ui():
         # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
         #                     [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
         #                      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
-        chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender],
+        chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path],
                           [chatbot, state, aux_state,output_audio])
         # chat_input.submit(lambda: "", None, chat_input)
         chat_input.submit(lambda: {"text": ""}, None, chat_input)
@@ -2280,7 +2280,7 @@ def create_ui():
                       [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                        image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                        name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-                       paragraph,artist,gender])
+                       paragraph,artist,gender,image_path])
 
         example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
 
@@ -2331,7 +2331,7 @@ def create_ui():
                 out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph,focus_d,openai_api_key,new_crop_save_path,gender
             ],
             outputs=[
-                chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,focus_info
+                chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
             ],
             show_progress=True,
             queue=True
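
Note: with `focus_info` dropped from this outputs list, every return path in `submit_caption` must yield exactly these eight values (see the corrected error-path return in the hunk above). A sketch of the required shape, using a hypothetical helper:

```python
# Hypothetical helper: the eight-value shape each submit_caption return path
# must now have; audio_output may be None on error paths.
def _submit_caption_result(state, click_index_state, input_mask_state,
                           input_points_state, input_labels_state,
                           out_state, audio_output):
    return (state, state, click_index_state, input_mask_state,
            input_points_state, input_labels_state, out_state, audio_output)
```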
 