Niki Zhang committed
Commit c0be077 · verified · 1 Parent(s): abb5985

Update app.py

Files changed (1):
  app.py: +89 -18
app.py CHANGED
@@ -347,9 +347,75 @@ def extract_features_siglip(image):
     return image_features
 
 @spaces.GPU
-def infer(crop_image_path,full_image_path,state,language):
+def infer(crop_image_path,full_image_path,state,language,task_type=None):
+    print("task type",task_type)
     gallery_output = []
-    if crop_image_path:
+    if task_type=="task 1":
+        gallery_output.append("recomendation_pic/1.8.jpg")
+        gallery_output.append("recomendation_pic/1.9.jpg")
+        input_image = Image.open(full_image_path).convert("RGB")
+        input_features = extract_features_siglip(input_image.convert("RGB"))
+        input_features = input_features.detach().cpu().numpy()
+        input_features = np.float32(input_features)
+        faiss.normalize_L2(input_features)
+        distances, indices = index.search(input_features, 2)
+        for i,v in enumerate(indices[0]):
+            sim = -distances[0][i]
+            image_url = df.iloc[v]["Link"]
+            img_retrieved = read_image_from_url(image_url)
+            gallery_output.append(img_retrieved)
+        if language=="English":
+            msg="🖼️ Please refer to the section below to see the recommended results."
+        else:
+            msg="🖼️ 请到下方查看推荐结果。"
+        state+=[(None,msg)]
+
+        return gallery_output,state,state
+    elif task_type=="task 2":
+        gallery_output.append("recomendation_pic/2.8.jpg")
+        gallery_output.append("recomendation_pic/2.9.png")
+        input_image = Image.open(full_image_path).convert("RGB")
+        input_features = extract_features_siglip(input_image.convert("RGB"))
+        input_features = input_features.detach().cpu().numpy()
+        input_features = np.float32(input_features)
+        faiss.normalize_L2(input_features)
+        distances, indices = index.search(input_features, 2)
+        for i,v in enumerate(indices[0]):
+            sim = -distances[0][i]
+            image_url = df.iloc[v]["Link"]
+            img_retrieved = read_image_from_url(image_url)
+            gallery_output.append(img_retrieved)
+        if language=="English":
+            msg="🖼️ Please refer to the section below to see the recommended results."
+        else:
+            msg="🖼️ 请到下方查看推荐结果。"
+        state+=[(None,msg)]
+
+        return gallery_output,state,state
+
+    elif task_type=="task 3":
+        gallery_output.append("recomendation_pic/3.8.png")
+        gallery_output.append("recomendation_pic/3.9.png")
+        input_image = Image.open(full_image_path).convert("RGB")
+        input_features = extract_features_siglip(input_image.convert("RGB"))
+        input_features = input_features.detach().cpu().numpy()
+        input_features = np.float32(input_features)
+        faiss.normalize_L2(input_features)
+        distances, indices = index.search(input_features, 2)
+        for i,v in enumerate(indices[0]):
+            sim = -distances[0][i]
+            image_url = df.iloc[v]["Link"]
+            img_retrieved = read_image_from_url(image_url)
+            gallery_output.append(img_retrieved)
+        if language=="English":
+            msg="🖼️ Please refer to the section below to see the recommended results."
+        else:
+            msg="🖼️ 请到下方查看推荐结果。"
+        state+=[(None,msg)]
+
+        return gallery_output,state,state
+
+    elif crop_image_path:
         input_image = Image.open(crop_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
         input_features = input_features.detach().cpu().numpy()
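The three task branches added above differ only in their two hard-coded gallery seeds; the SigLIP-embed, FAISS-search, and message logic is repeated verbatim (and sim is computed but never used). A minimal refactor sketch of how the duplication could be collapsed — TASK_SEEDS and retrieve_similar are hypothetical names, not part of this commit, and the sketch assumes the module-level index, df, extract_features_siglip, and read_image_from_url already defined in app.py:

# Sketch only -- TASK_SEEDS and retrieve_similar are hypothetical names.
TASK_SEEDS = {
    "task 1": ["recomendation_pic/1.8.jpg", "recomendation_pic/1.9.jpg"],
    "task 2": ["recomendation_pic/2.8.jpg", "recomendation_pic/2.9.png"],
    "task 3": ["recomendation_pic/3.8.png", "recomendation_pic/3.9.png"],
}

def retrieve_similar(image_path, k=2):
    # Embed with SigLIP, L2-normalise, and return the k nearest gallery images.
    feats = extract_features_siglip(Image.open(image_path).convert("RGB"))
    feats = np.float32(feats.detach().cpu().numpy())
    faiss.normalize_L2(feats)
    _, indices = index.search(feats, k)
    return [read_image_from_url(df.iloc[v]["Link"]) for v in indices[0]]

def infer(crop_image_path, full_image_path, state, language, task_type=None):
    if task_type in TASK_SEEDS:
        gallery_output = TASK_SEEDS[task_type] + retrieve_similar(full_image_path)
        msg = ("🖼️ Please refer to the section below to see the recommended results."
               if language == "English" else "🖼️ 请到下方查看推荐结果。")
        state += [(None, msg)]
        return gallery_output, state, state
    ...  # crop_image_path branch unchanged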
@@ -601,6 +667,7 @@ focus_map = {
 }
 
 
+
 prompt_list = [
     [
         'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
@@ -855,7 +922,6 @@ async def chat_input_callback(*args):
 
 async def upload_callback(image_input,state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None,autoplay=False,session="Session 1"):
     print("narritive", narritive)
-    print("image input",image_input)
     if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
         image_input = image_input['background']
 
@@ -1603,8 +1669,8 @@ async def associate(image_path,new_crop,openai_api_key,language,autoplay,length,
     audio_output = await texttospeech(read_info, language)
     return reason,audio_output,log_state,index,gr.update(value=[])
 
-def change_naritive(task_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
-    if task_type=="Session 1":
+def change_naritive(session_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
+    if session_type=="Session 1":
         return None, [], [], [[], [], []], "", None, []
     else:
         if language=="English":
@@ -1652,6 +1718,7 @@ def change_naritive(task_type,image_input, chatbot, state, click_state, paragrap
         )
     ]
 
+
     return image_input, state, state, click_state, paragraph, origin_image
 
 
@@ -1696,10 +1763,9 @@ def create_ui():
    description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
 
    examples = [
-        ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg"],
-        ["test_images/2.Football Players.jpg","test_images/task2.jpg"],
-        ["test_images/3.Along the River during the Qingming Festival.jpeg","test_images/task3.jpg"],
-        # ["test_images/test3.jpg"],
+        ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg","task 1"],
+        ["test_images/2.Football Players.jpg","test_images/task2.jpg","task 2"],
+        ["test_images/3.Along the River during the Qingming Festival.jpeg","test_images/task3.jpg","task 3"],
        # ["test_images/test4.jpg"],
        # ["test_images/test5.jpg"],
        # ["test_images/Picture5.png"],
@@ -1963,9 +2029,10 @@ def create_ui():
                     interactive=True)
 
                 with gr.Row():
+                    task_type = gr.Textbox(visible=False)
                     gr.Examples(
                         examples=examples,
-                        inputs=[example_image,task_instuction],
+                        inputs=[example_image,task_instuction,task_type],
                     )
 
 
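The third column added to each examples row needs a matching third component in gr.Examples(inputs=...), which is what the invisible task_type Textbox provides: clicking an example writes its row into the inputs column by column, so the task tag is set without any visible control. A minimal self-contained sketch of the pattern (component types and file paths here are illustrative, not taken from app.py):

import gradio as gr

with gr.Blocks() as demo:
    example_image = gr.Image(type="filepath")
    task_instuction = gr.Image(type="filepath")
    task_type = gr.Textbox(visible=False)  # hidden, but Examples still fills it
    gr.Examples(
        examples=[["test_images/1.The Ambassadors.jpg", "test_images/task1.jpg", "task 1"]],
        inputs=[example_image, task_instuction, task_type],  # one column per component
    )
    # task_type can now be passed as an input to any .click()/.change() handler

demo.launch()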
@@ -2074,7 +2141,7 @@ def create_ui():
             chat_log_file = gr.File(label="Download Chat Log",scale=5)
 
         with gr.Row(elem_id="top_row",visible=False) as top_row:
-            task_type = gr.Dropdown(
+            session_type = gr.Dropdown(
                 ["Session 1","Session 2"],
                 value="Session 1", label="Task", interactive=True, elem_classes="custom-language"
             )
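Renaming the dropdown from task_type to session_type frees the old name for the hidden Examples Textbox above, so the Session 1/Session 2 selector no longer shares an identifier with the per-example task tag that infer consumes.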
@@ -2090,6 +2157,7 @@ def create_ui():
                 interactive=True,
                 label="Generated Caption Length",
             )
+
             # auto_play = gr.Checkbox(
             #     label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
             # )
@@ -2136,7 +2204,7 @@ def create_ui():
         # )
         recommend_btn.click(
             fn=infer,
-            inputs=[new_crop_save_path,image_path,state,language],
+            inputs=[new_crop_save_path,image_path,state,language,task_type],
             outputs=[gallery_result,chatbot,state]
         )
 
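gr.Button.click passes the inputs list to fn positionally, so the appended component lines up with the new fifth parameter of infer. A comment sketch of the mapping (assuming the Textbox default of an empty string, infer then falls through to the crop_image_path branch when no example row has been clicked):

# Positional mapping of recommend_btn.click inputs onto infer's signature:
#   new_crop_save_path -> crop_image_path
#   image_path         -> full_image_path
#   state              -> state
#   language           -> language
#   task_type          -> task_type  (hidden Textbox; "" unless an example row set it)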
@@ -2423,7 +2491,7 @@ def create_ui():
        #                          name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
        #                          paragraph,artist,gender,image_path])
 
-        image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,task_type],
+        image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,session_type],
                            [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                             image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                             name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
@@ -2448,7 +2516,7 @@ def create_ui():
        # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
        #                          [chatbot, state, aux_state,output_audio])
        # submit_button_text.click(lambda: "", None, chat_input)
-        example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,task_type],
+        example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,session_type],
                             [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                              image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                              name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
@@ -2598,17 +2666,20 @@ def create_ui():
 
        naritive.change(
            change_naritive,
-            [task_type, image_input, chatbot, state, click_state, paragraph, origin_image,naritive,language],
+            [session_type, image_input, chatbot, state, click_state, paragraph, origin_image,naritive,language],
            [image_input, chatbot, state, click_state, paragraph, origin_image,gallery_result],
            queue=False,
            show_progress=False
 
        )
+        def session_change():
+            instruction=Image.open('test_images/task4.jpg')
+            return None, [], [], [[], [], []], "", None, [],[],instruction
 
-        task_type.change(
-            lambda: ([]),
+        session_type.change(
+            session_change,
            [],
-            [log_state]
+            [image_input, chatbot, state, click_state, paragraph, origin_image,history_log,log_state,task_instuction]
        )
 
        # upvote_btn.click(
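session_change returns nine values that Gradio assigns positionally to the nine components in the outputs list, so the two must stay in lockstep. A sketch of the correspondence as added in this hunk:

# session_change() return value -> output component:
#   None          -> image_input      (clear the uploaded image)
#   []            -> chatbot          (reset the chat window)
#   []            -> state
#   [[], [], []]  -> click_state
#   ""            -> paragraph
#   None          -> origin_image
#   []            -> history_log
#   []            -> log_state
#   instruction   -> task_instuction  (the test_images/task4.jpg instruction image)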
 