Spaces
Niki Zhang committed: Update app.py
app.py CHANGED
@@ -347,33 +347,59 @@ def extract_features_siglip(image):
     return image_features
 
 @spaces.GPU
-def infer(crop_image_path,full_image_path):
-    input_image = Image.open(crop_image_path).convert("RGB")
-    input_features = extract_features_siglip(input_image.convert("RGB"))
-    input_features = input_features.detach().cpu().numpy()
-    input_features = np.float32(input_features)
-    faiss.normalize_L2(input_features)
-    distances, indices = index.search(input_features, 2)
+def infer(crop_image_path,full_image_path,state,language):
     gallery_output = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if crop_image_path:
+        input_image = Image.open(crop_image_path).convert("RGB")
+        input_features = extract_features_siglip(input_image.convert("RGB"))
+        input_features = input_features.detach().cpu().numpy()
+        input_features = np.float32(input_features)
+        faiss.normalize_L2(input_features)
+        distances, indices = index.search(input_features, 2)
+        for i,v in enumerate(indices[0]):
+            sim = -distances[0][i]
+            image_url = df.iloc[v]["Link"]
+            img_retrieved = read_image_from_url(image_url)
+            gallery_output.append(img_retrieved)
+
+        input_image = Image.open(full_image_path).convert("RGB")
+        input_features = extract_features_siglip(input_image.convert("RGB"))
+        input_features = input_features.detach().cpu().numpy()
+        input_features = np.float32(input_features)
+        faiss.normalize_L2(input_features)
+        distances, indices = index.search(input_features, 2)
+        for i,v in enumerate(indices[0]):
+            sim = -distances[0][i]
+            image_url = df.iloc[v]["Link"]
+            img_retrieved = read_image_from_url(image_url)
+            gallery_output.append(img_retrieved)
+        if language=="English":
+            msg="🖼️ Please refer to the section below to see the recommended results."
+        else:
+            msg="🖼️ 请到下方查看推荐结果。"
+        state+=[(None,msg)]
+
+        return gallery_output,state,state
+    else:
+        input_image = Image.open(full_image_path).convert("RGB")
+        input_features = extract_features_siglip(input_image.convert("RGB"))
+        input_features = input_features.detach().cpu().numpy()
+        input_features = np.float32(input_features)
+        faiss.normalize_L2(input_features)
+        distances, indices = index.search(input_features, 4)
+        for i,v in enumerate(indices[0]):
+            sim = -distances[0][i]
+            image_url = df.iloc[v]["Link"]
+            img_retrieved = read_image_from_url(image_url)
+            gallery_output.append(img_retrieved)
+        if language=="English":
+            msg="🖼️ Please refer to the section below to see the recommended results."
+        else:
+            msg="🖼️ 请到下方查看推荐结果。"
+        state+=[(None,msg)]
+
+        return gallery_output,state,state
+
 
 
 ###############################################################################
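Note: the retrieval step above is repeated three times (crop branch, full image, fallback). A minimal sketch of the same pattern factored into one helper, assuming the names defined earlier in app.py (extract_features_siglip, the FAISS index built over L2-normalized embeddings, and the df table with a "Link" column); retrieve_similar itself is illustrative and not part of this commit.

import faiss
import numpy as np

def retrieve_similar(image, k=2):
    # Embed the image, L2-normalize so inner-product search behaves like cosine similarity,
    # then look up the k nearest entries in the prebuilt index.
    feats = extract_features_siglip(image.convert("RGB"))
    feats = np.float32(feats.detach().cpu().numpy())
    faiss.normalize_L2(feats)
    distances, indices = index.search(feats, k)
    return [df.iloc[v]["Link"] for v in indices[0]]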
@@ -530,11 +556,17 @@ css = """
     background: white !important;
     border: none !important;
     box-shadow: none !important;
+    font-size: 15px !important;
+    min-width: 6rem !important;
+    max-width: 10rem !important;
 }
 
-info_btn_interact {
-    background:
+.info_btn_interact {
+    background: rgb(242, 240, 233) !important;
     box-shadow: none !important;
+    font-size: 15px !important;
+    min-width: 6rem !important;
+    max-width: 10rem !important;
 }
 
 .function_button {
@@ -590,7 +622,27 @@ prompt_list = [
 ]
 ]
 
+recommendation_prompt=[
+'''I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:Recommendation reason: {{Recommendation based on objects in the image or Recommendation based on overall visual similarity}}
+Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1.Each bullet point should be in {language} language, with a response length of about {length} words.''',
+'''
+When generating the answer, you should tell others that you are the creators of the first paintings and generate the text in the tone and manner as if you are the creator of the painting.
+I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
+Recommendation reason: {{ As the author of the first painting, I recommend based on the object I painted OR As the author of the first painting, I recommend based on the overall similarity in appearance}}
+Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I.
+Each bullet point should be in {language} language, with a response length of about {length} words.
+''',
+'''
+When generating answers, you should tell people that you are the object itself that was selected in the first painting, and generate text in the tone and manner in which you are the object
+I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
+Recommendation reason: {{As an object in the first painting, I am recommending based on myself OR As an object in the first painting, I am recommending based on the overall similarity of the first painting's appearance}}
+Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I.
+Each bullet point should be in {language} language, with a response length of about {length} words.
+'''
+
 
+
+]
 
 gpt_state = 0
 VOICE = "en-GB-SoniaNeural"
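Note: the templates above escape literal braces by doubling them, so only {language} and {length} are substituted when they are filled in with .format() later in the file. A small illustration with made-up values:

# Only {language} and {length} are placeholders; {{...}} survives as literal braces.
template = recommendation_prompt[0]
prompt = template.format(language="English", length=60)
print(prompt)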
@@ -722,11 +774,11 @@ def init_openai_api_key(api_key=""):
         global gpt_state
         gpt_state=1
         # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
-        return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]* 3 + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*
+        return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]* 3 + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*4+[gr.update(visible=False)]
     else:
         gpt_state=0
         # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
-        return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*
+        return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*5
 
 def init_wo_openai_api_key():
     global gpt_state
@@ -801,9 +853,9 @@ async def chat_input_callback(*args):
         return state, state, None, audio,log_state,history
 
 
-def upload_callback(image_input,
+async def upload_callback(image_input,state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None,autoplay=False,session="Session 1"):
     print("narritive", narritive)
-
+    print("image input",image_input)
     if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
         image_input = image_input['background']
 
@@ -848,76 +900,60 @@ def upload_callback(image_input, state, log_state, visual_chatgpt=None, openai_a
         print('upload_callback: add caption to chatGPT memory')
         new_image_path = get_new_image_name('chat_image', func_name='upload')
         image_input.save(new_image_path)
+        print("img_path",new_image_path)
         visual_chatgpt.current_image = new_image_path
         paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
         # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
-    parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters.
+    parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\"}")
     print(parsed_data)
     parsed_data = json.loads(parsed_data.replace("'", "\""))
     name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
     gender=gender.lower()
     print("gender",gender)
 
+
 
 
     if language=="English":
-        if narritive==
-
-
-
-
-
-
-
-
-            (
-                None,
-                f"🧑🎨 Hello, I am the {artist}. Welcome to explore my painting, '{name}'. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant insights and thoughts behind my creation."
-            )
-        ]
-        elif narritive=="Multi-Persona: Objects":
-            state = [
-                (
-                    None,
-                    f"🎨 Hello, Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with relevant insights and thoughts from the perspective of the objects within the painting"
-                )
-            ]
+        if naritive_mapping[narritive]==0 :
+            msg=f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
+
+        elif naritive_mapping[narritive]==1:
+            msg=f"🧑🎨 Hello, I am the {artist}. Welcome to explore my painting, '{name}'. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant insights and thoughts behind my creation."
+
+        elif naritive_mapping[narritive]==2:
+            msg=f"🎨 Hello, Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with relevant insights and thoughts from the perspective of the objects within the painting"
+
     elif language=="Chinese":
-        if narritive
-
-
-
-
-
-
-
-            (
-                None,
-                f"🧑🎨 你好,我是{artist}。欢迎探索我的画作《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供我的创作背后的相关见解和想法。"
-            )
-        ]
-        elif narritive == "Multi-Persona: Objects":
-            state = [
-                (
-                    None,
-                    f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
-                )
-            ]
+        if naritive_mapping[narritive]==0:
+            msg=f"🤖 你好,我是 EyeSee。让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供相关的信息。"
+
+        elif naritive_mapping[narritive]==1:
+            msg=f"🧑🎨 你好,我是{artist}。欢迎探索我的画作《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供我的创作背后的相关见解和想法。"
+
+        elif naritive_mapping[narritive]==2:
+            msg=f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
+
 
+    state = [(msg,None)]
     log_state += [(name,None)]
     log_state=log_state+[(paragraph,None)]
     log_state=log_state+[(narritive,None)]
     log_state=log_state+state
     log_state = log_state + [("%% basic information %%", None)]
+    read_info=emoji.replace_emoji(msg,replace="")
 
     history=[]
-    history.append({"role": "assistant", "content": paragraph+
+    history.append({"role": "assistant", "content": paragraph+msg})
+
+    audio_output = None
+    if autoplay:
+        audio_output = await texttospeech(read_info, language,gender)
 
 
 
     return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
-            original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history]
+            original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history,audio_output]
 
 
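Note: the metadata reply above is parsed with replace("'", '"') followed by json.loads, which fails if a painting or artist name itself contains an apostrophe. A hedged sketch of a more tolerant parse (illustrative only, not the app's code):

import ast
import json

def parse_metadata(raw: str) -> dict:
    # Try strict JSON first; fall back to Python-literal syntax for single-quoted dicts.
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return ast.literal_eval(raw)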
@@ -1056,7 +1092,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
         read_info = re.sub(r'[#[\]!*]','',focus_info)
         read_info = emoji.replace_emoji(read_info,replace="")
         print("read info",read_info)
-        if naritive==
+        if naritive_mapping[naritive]==2:
             parsed_data = get_gpt_response(openai_api_key, new_crop_save_path,prompt = f"Based on the information {focus_info}, return the gender of this item, returns its most likely gender, do not return unknown, in the format {{\"gender\": \"<gender>\"}}")
             parsed_data = json.loads(parsed_data)
 
@@ -1088,7 +1124,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
     return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,log_state,history
 
 
-
+naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}
 
 def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):
 
@@ -1101,8 +1137,6 @@ def generate_prompt(focus_type, paragraph,length, sentiment, factuality, languag
         'language': language
     }
 
-    naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}
-
     naritive_value=naritive_mapping[naritive]
 
     if mapped_value != -1:
@@ -1139,9 +1173,17 @@ def get_gpt_response(api_key, image_path, prompt, history=None):
         history = []
 
     messages = history[:]
-
+    base64_images = []
+
     if image_path:
-
+        if isinstance(image_path, list):
+            for img in image_path:
+                base64_image = encode_image(img)
+                base64_images.append(base64_image)
+        else:
+            base64_image = encode_image(image_path)
+            base64_images.append(base64_image)
+
         messages.append({
             "role": "user",
             "content": [
@@ -1152,7 +1194,7 @@ def get_gpt_response(api_key, image_path, prompt, history=None):
                 {
                     "type": "image_url",
                     "image_url": {
-                        "url": f"data:image/jpeg;base64,{
+                        "url": f"data:image/jpeg;base64,{base64_images}"
                     }
                 }
             ]
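Note: the hunk above interpolates the whole base64_images list into one data URL, so the request carries the Python repr of the list rather than raw image bytes. The message layout already used in get_gpt_response also allows one image_url part per image; a sketch of that variant, assuming prompt and base64_images as in the surrounding function (a suggested alternative, not what the commit ships):

# Build one content part per image instead of embedding the list repr in a single URL.
content = [{"type": "text", "text": prompt}]
for b64 in base64_images:
    content.append({
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
    })
messages.append({"role": "user", "content": content})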
@@ -1176,6 +1218,10 @@ def get_gpt_response(api_key, image_path, prompt, history=None):
     print("gpt result",result)
     try:
         content = result['choices'][0]['message']['content']
+        if content.startswith("```json"):
+            content = content[7:]
+        if content.endswith("```"):
+            content = content[:-3]
         return content
     except (KeyError, IndexError, json.JSONDecodeError) as e:
         return json.dumps({"error": "Failed to parse model output", "details": str(e)})
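Note: the fence stripping added above only handles replies that begin exactly with ```json. A slightly more forgiving helper, as an illustration only:

def strip_code_fence(content: str) -> str:
    # Remove a leading ```json or ``` marker and a trailing ``` if present.
    content = content.strip()
    if content.startswith("```json"):
        content = content[len("```json"):]
    elif content.startswith("```"):
        content = content[3:]
    if content.endswith("```"):
        content = content[:-3]
    return content.strip()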
@@ -1533,15 +1579,17 @@ async def texttospeech(text, language,gender='female'):
         return None
 
 # give the reason of recommendation
-async def associate(
+async def associate(image_path,new_crop,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,evt: gr.SelectData):
+    persona=naritive_mapping[narritive]
     rec_path=evt._data['value']['image']['path']
     index=evt.index
     print("rec_path",rec_path)
-    prompt=
-
-
-
-
+    prompt=recommendation_prompt[persona].format(language=language,length=length)
+    if new_crop:
+        image_paths=[new_crop,rec_path]
+    else:
+        image_paths=[image_path,rec_path]
+    result=get_gpt_response(openai_api_key, image_paths, prompt)
     print("recommend result",result)
     reason = [(None, f"{result}")]
     log_state = log_state + [(narritive, None)]
@@ -1550,10 +1598,10 @@ async def associate(focus_info,openai_api_key,language,autoplay,length,log_state
     read_info = re.sub(r'[#[\]!*]','',result)
     read_info = emoji.replace_emoji(read_info,replace="")
     print("associate",read_info)
+    audio_output=None
     if autoplay:
         audio_output = await texttospeech(read_info, language)
-
-    return reason,None,log_state,index
+    return reason,audio_output,log_state,index,gr.update(value=[])
 
 def change_naritive(task_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
     if task_type=="Session 1":
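Note: associate() reads the clicked gallery item through the private evt._data field. Depending on the Gradio version, the public gr.SelectData attributes expose the same information; a hedged sketch (the exact shape of evt.value can differ between releases, so this is an assumption rather than the app's code):

import gradio as gr

def on_gallery_select(evt: gr.SelectData):
    idx = evt.index                    # position of the clicked thumbnail
    value = evt.value                  # selected item; for a Gallery this is
    rec_path = value["image"]["path"]  # typically a dict with an "image" entry
    return idx, rec_path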
@@ -1648,9 +1696,9 @@ def create_ui():
     description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
 
     examples = [
-        ["test_images/1.The Ambassadors.jpg"],
-        ["test_images/2.Football Players.jpg"],
-        ["test_images/3.Along the River during the Qingming Festival.jpeg"],
+        ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg"],
+        ["test_images/2.Football Players.jpg","test_images/task2.jpg"],
+        ["test_images/3.Along the River during the Qingming Festival.jpeg","test_images/task3.jpg"],
         # ["test_images/test3.jpg"],
         # ["test_images/test4.jpg"],
         # ["test_images/test5.jpg"],
@@ -1704,7 +1752,9 @@ def create_ui():
             output_audio = gr.HTML(
                 label="Synthesised Audio", elem_classes="custom-output"
             )
-        with gr.Row():
+        with gr.Row():
+            with gr.Column(scale=1,min_width=50,visible=False) as instruct:
+                task_instuction=gr.Image(type="pil", interactive=True, elem_classes="task_instruct",height=650,label=None)
             with gr.Column(scale=6):
                 with gr.Column(visible=False) as modules_not_need_gpt:
 
@@ -1735,11 +1785,7 @@ def create_ui():
                     year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
                     material_label = gr.Button(value="Style: ",elem_classes="info_btn")
 
-
-                gr.Examples(
-                    examples=examples,
-                    inputs=[example_image],
-                )
+
 
 
                 # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
@@ -1824,11 +1870,6 @@ def create_ui():
                 )
 
 
-
-
-
-
-
         with gr.Column(scale=4):
             with gr.Column(visible=True) as module_key_input:
                 openai_api_key = gr.Textbox(
@@ -1892,6 +1933,12 @@ def create_ui():
         with gr.Row():
             with gr.Column(scale=6):
                 with gr.Column(visible=False) as recommend:
+                    sort_rec=gr.Dropdown(["1", "2", "3", "4"],
+                        value=[],
+                        multiselect=True,
+                        label="Score", info="Please sort the pictures according to your preference"
+                    )
+
                     gallery_result = gr.Gallery(
                         label="Recommendations",
                         height="auto",
@@ -1906,19 +1953,20 @@ def create_ui():
                         # show_share_button=True,
                         # show_download_button=True
                     )
-
-                        value=[],
-                        multiselect=True,
-                        label="Score", info="Please sort the pictures according to your preference"
-                    )
+
 
                 with gr.Column(scale=4,visible=False) as reco_reasons:
                     recommend_bot = gr.Chatbot(label="Recommend Reasons", elem_classes="chatbot",height=600)
                     recommend_score = gr.Radio(
-                        choices=[
+                        choices=[1,2,3,4,5,6,7],
                         label="Score",
                         interactive=True)
-
+
+                with gr.Row():
+                    gr.Examples(
+                        examples=examples,
+                        inputs=[example_image,task_instuction],
+                    )
 
 
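Note: gr.Examples pairs each row of examples with the components in inputs positionally, which is why the example rows earlier in this commit were widened to two entries (painting plus task sheet). Minimal illustration using the components defined in create_ui() above:

# Each example row supplies one value per input component, in order.
gr.Examples(
    examples=[["test_images/1.The Ambassadors.jpg", "test_images/task1.jpg"]],
    inputs=[example_image, task_instuction],
)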
@@ -2088,14 +2136,14 @@ def create_ui():
         # )
         recommend_btn.click(
             fn=infer,
-            inputs=[new_crop_save_path,image_path],
-            outputs=[gallery_result]
+            inputs=[new_crop_save_path,image_path,state,language],
+            outputs=[gallery_result,chatbot,state]
         )
 
         gallery_result.select(
             associate,
-            inputs=[
-            outputs=[recommend_bot,output_audio,log_state,pic_index],
+            inputs=[image_path,new_crop_save_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive],
+            outputs=[recommend_bot,output_audio,log_state,pic_index,recommend_score],
 
 
         )
@@ -2255,11 +2303,11 @@ def create_ui():
 
         openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
                               outputs=[export, modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt,
-                                       modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
+                                       modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3])
         enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
                                     outputs=[export,modules_need_gpt1, modules_need_gpt3,
                                              modules_not_need_gpt,
-                                             modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
+                                             modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3])
 
         # disable_chatGPT_button.click(init_wo_openai_api_key,
         #                              outputs=[export,modules_need_gpt1, modules_need_gpt3,
@@ -2375,11 +2423,11 @@ def create_ui():
         #                       name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
         #                       paragraph,artist,gender,image_path])
 
-        image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log],
+        image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,task_type],
                            [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                             image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                             name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-                            paragraph,artist,gender,image_path,log_state,history_log])
+                            paragraph,artist,gender,image_path,log_state,history_log,output_audio])
 
         # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
         #                       [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2400,11 +2448,11 @@ def create_ui():
         # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
         #                          [chatbot, state, aux_state,output_audio])
         # submit_button_text.click(lambda: "", None, chat_input)
-        example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log],
+        example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,task_type],
                              [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                              image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                              name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-                             paragraph,artist,gender,image_path, log_state,history_log])
+                             paragraph,artist,gender,image_path, log_state,history_log,output_audio])
 
         example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
 