Niki Zhang committed: Update app.py

app.py CHANGED
@@ -347,9 +347,75 @@ def extract_features_siglip(image):
     return image_features
 
 @spaces.GPU
-def infer(crop_image_path,full_image_path,state,language):
+def infer(crop_image_path,full_image_path,state,language,task_type=None):
+    print("task type",task_type)
     gallery_output = []
-    if
+    if task_type=="task 1":
+        gallery_output.append("recomendation_pic/1.8.jpg")
+        gallery_output.append("recomendation_pic/1.9.jpg")
+        input_image = Image.open(full_image_path).convert("RGB")
+        input_features = extract_features_siglip(input_image.convert("RGB"))
+        input_features = input_features.detach().cpu().numpy()
+        input_features = np.float32(input_features)
+        faiss.normalize_L2(input_features)
+        distances, indices = index.search(input_features, 2)
+        for i,v in enumerate(indices[0]):
+            sim = -distances[0][i]
+            image_url = df.iloc[v]["Link"]
+            img_retrieved = read_image_from_url(image_url)
+            gallery_output.append(img_retrieved)
+        if language=="English":
+            msg="🖼️ Please refer to the section below to see the recommended results."
+        else:
+            msg="🖼️ 请到下方查看推荐结果。"
+        state+=[(None,msg)]
+
+        return gallery_output,state,state
+    elif task_type=="task 2":
+        gallery_output.append("recomendation_pic/2.8.jpg")
+        gallery_output.append("recomendation_pic/2.9.png")
+        input_image = Image.open(full_image_path).convert("RGB")
+        input_features = extract_features_siglip(input_image.convert("RGB"))
+        input_features = input_features.detach().cpu().numpy()
+        input_features = np.float32(input_features)
+        faiss.normalize_L2(input_features)
+        distances, indices = index.search(input_features, 2)
+        for i,v in enumerate(indices[0]):
+            sim = -distances[0][i]
+            image_url = df.iloc[v]["Link"]
+            img_retrieved = read_image_from_url(image_url)
+            gallery_output.append(img_retrieved)
+        if language=="English":
+            msg="🖼️ Please refer to the section below to see the recommended results."
+        else:
+            msg="🖼️ 请到下方查看推荐结果。"
+        state+=[(None,msg)]
+
+        return gallery_output,state,state
+
+    elif task_type=="task 3":
+        gallery_output.append("recomendation_pic/3.8.png")
+        gallery_output.append("recomendation_pic/3.9.png")
+        input_image = Image.open(full_image_path).convert("RGB")
+        input_features = extract_features_siglip(input_image.convert("RGB"))
+        input_features = input_features.detach().cpu().numpy()
+        input_features = np.float32(input_features)
+        faiss.normalize_L2(input_features)
+        distances, indices = index.search(input_features, 2)
+        for i,v in enumerate(indices[0]):
+            sim = -distances[0][i]
+            image_url = df.iloc[v]["Link"]
+            img_retrieved = read_image_from_url(image_url)
+            gallery_output.append(img_retrieved)
+        if language=="English":
+            msg="🖼️ Please refer to the section below to see the recommended results."
+        else:
+            msg="🖼️ 请到下方查看推荐结果。"
+        state+=[(None,msg)]
+
+        return gallery_output,state,state
+
+    elif crop_image_path:
         input_image = Image.open(crop_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
         input_features = input_features.detach().cpu().numpy()
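Note: the three task branches above repeat the same SigLIP-plus-FAISS retrieval steps. A minimal sketch of that shared pattern, assuming the `index`, `df`, `extract_features_siglip`, and `read_image_from_url` globals defined elsewhere in app.py; the `retrieve_similar` helper itself is hypothetical, not part of the commit:

```python
import numpy as np
import faiss
from PIL import Image

def retrieve_similar(image_path, top_k=2):
    """Embed an image with SigLIP and return the top_k most similar images.

    Hypothetical helper: index (a FAISS index), df (a DataFrame with a "Link"
    column), extract_features_siglip, and read_image_from_url are assumed to be
    the globals app.py already defines.
    """
    image = Image.open(image_path).convert("RGB")
    features = extract_features_siglip(image).detach().cpu().numpy()
    features = np.float32(features)
    faiss.normalize_L2(features)  # normalized vectors -> inner product acts as cosine similarity
    distances, indices = index.search(features, top_k)
    return [read_image_from_url(df.iloc[v]["Link"]) for v in indices[0]]
```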
@@ -601,6 +667,7 @@ focus_map = {
 }
 
 
+
 prompt_list = [
     [
         'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
@@ -855,7 +922,6 @@ async def chat_input_callback(*args):
 
 async def upload_callback(image_input,state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None,autoplay=False,session="Session 1"):
     print("narritive", narritive)
-    print("image input",image_input)
     if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
         image_input = image_input['background']
 
@@ -1603,8 +1669,8 @@ async def associate(image_path,new_crop,openai_api_key,language,autoplay,length,
     audio_output = await texttospeech(read_info, language)
     return reason,audio_output,log_state,index,gr.update(value=[])
 
-def change_naritive(
-    if
+def change_naritive(session_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
+    if session_type=="Session 1":
         return None, [], [], [[], [], []], "", None, []
     else:
         if language=="English":
@@ -1652,6 +1718,7 @@ def change_naritive(task_type,image_input, chatbot, state, click_state, paragrap
             )
         ]
 
+
     return image_input, state, state, click_state, paragraph, origin_image
 
 
@@ -1696,10 +1763,9 @@ def create_ui():
     description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
 
     examples = [
-        ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg"],
-        ["test_images/2.Football Players.jpg","test_images/task2.jpg"],
-        ["test_images/3.Along the River during the Qingming Festival.jpeg","test_images/task3.jpg"],
-        # ["test_images/test3.jpg"],
+        ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg","task 1"],
+        ["test_images/2.Football Players.jpg","test_images/task2.jpg","task 2"],
+        ["test_images/3.Along the River during the Qingming Festival.jpeg","test_images/task3.jpg","task 3"],
         # ["test_images/test4.jpg"],
         # ["test_images/test5.jpg"],
         # ["test_images/Picture5.png"],
@@ -1963,9 +2029,10 @@ def create_ui():
                 interactive=True)
 
             with gr.Row():
+                task_type = gr.Textbox(visible=False)
                 gr.Examples(
                     examples=examples,
-                    inputs=[example_image,task_instuction],
+                    inputs=[example_image,task_instuction,task_type],
                 )
 
 
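Note: the hidden `task_type` Textbox above is filled by `gr.Examples` along with the visible inputs, so clicking an example also records which task it belongs to. A self-contained sketch of that pattern (component names and the button are illustrative, not from the commit):

```python
import gradio as gr

with gr.Blocks() as demo:
    example_image = gr.Image(type="filepath")
    task_type = gr.Textbox(visible=False)  # hidden; populated when an example is clicked
    picked = gr.Textbox(label="Selected task")

    gr.Examples(
        examples=[["test_images/1.The Ambassadors.jpg", "task 1"]],
        inputs=[example_image, task_type],
    )

    # Later callbacks can read the hidden value like any other component.
    gr.Button("Show task").click(lambda t: t, [task_type], [picked])
```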
@@ -2074,7 +2141,7 @@ def create_ui():
             chat_log_file = gr.File(label="Download Chat Log",scale=5)
 
         with gr.Row(elem_id="top_row",visible=False) as top_row:
-
+            session_type = gr.Dropdown(
                 ["Session 1","Session 2"],
                 value="Session 1", label="Task", interactive=True, elem_classes="custom-language"
             )
@@ -2090,6 +2157,7 @@ def create_ui():
                 interactive=True,
                 label="Generated Caption Length",
             )
+
             # auto_play = gr.Checkbox(
             #     label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
             # )
@@ -2136,7 +2204,7 @@ def create_ui():
         # )
         recommend_btn.click(
             fn=infer,
-            inputs=[new_crop_save_path,image_path,state,language],
+            inputs=[new_crop_save_path,image_path,state,language,task_type],
             outputs=[gallery_result,chatbot,state]
         )
 
@@ -2423,7 +2491,7 @@ def create_ui():
     # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
     # paragraph,artist,gender,image_path])
 
-    image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,
+    image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,session_type],
                        [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                        image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                        name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
@@ -2448,7 +2516,7 @@ def create_ui():
     # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
     #                  [chatbot, state, aux_state,output_audio])
     # submit_button_text.click(lambda: "", None, chat_input)
-    example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,
+    example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,session_type],
                        [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                        image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                        name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
@@ -2598,17 +2666,20 @@ def create_ui():
 
     naritive.change(
         change_naritive,
-        [
+        [session_type, image_input, chatbot, state, click_state, paragraph, origin_image,naritive,language],
         [image_input, chatbot, state, click_state, paragraph, origin_image,gallery_result],
         queue=False,
         show_progress=False
 
     )
+    def session_change():
+        instruction=Image.open('test_images/task4.jpg')
+        return None, [], [], [[], [], []], "", None, [],[],instruction
 
-
-
+    session_type.change(
+        session_change,
         [],
-        [log_state]
+        [image_input, chatbot, state, click_state, paragraph, origin_image,history_log,log_state,task_instuction]
     )
 
     # upvote_btn.click(
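Note: the `session_change` wiring above clears the UI by returning one "empty" value per output component, in output order. A self-contained sketch of the same Dropdown-reset pattern (components here are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    session_type = gr.Dropdown(["Session 1", "Session 2"], value="Session 1", label="Task")
    chatbot = gr.Chatbot()
    paragraph = gr.Textbox()
    state = gr.State([])

    def session_change():
        # One empty value per output component, matching the outputs list below.
        return [], "", []

    session_type.change(session_change, [], [chatbot, paragraph, state])
```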
|