Niki Zhang committed on
Commit abb5985 · verified · 1 Parent(s): 1db79b2

Update app.py

Files changed (1)
  1. app.py +171 -123
app.py CHANGED
@@ -347,33 +347,59 @@ def extract_features_siglip(image):
      return image_features
 
  @spaces.GPU
- def infer(crop_image_path,full_image_path):
-     input_image = Image.open(crop_image_path).convert("RGB")
-     input_features = extract_features_siglip(input_image.convert("RGB"))
-     input_features = input_features.detach().cpu().numpy()
-     input_features = np.float32(input_features)
-     faiss.normalize_L2(input_features)
-     distances, indices = index.search(input_features, 2)
      gallery_output = []
-     for i,v in enumerate(indices[0]):
-         sim = -distances[0][i]
-         image_url = df.iloc[v]["Link"]
-         img_retrieved = read_image_from_url(image_url)
-         gallery_output.append(img_retrieved)
-
-     input_image = Image.open(full_image_path).convert("RGB")
-     input_features = extract_features_siglip(input_image.convert("RGB"))
-     input_features = input_features.detach().cpu().numpy()
-     input_features = np.float32(input_features)
-     faiss.normalize_L2(input_features)
-     distances, indices = index.search(input_features, 2)
-     for i,v in enumerate(indices[0]):
-         sim = -distances[0][i]
-         image_url = df.iloc[v]["Link"]
-         img_retrieved = read_image_from_url(image_url)
-         gallery_output.append(img_retrieved)
-
-     return gallery_output
 
 
  ###############################################################################
@@ -530,11 +556,17 @@ css = """
      background: white !important;
      border: none !important;
      box-shadow: none !important;
  }
 
- info_btn_interact {
-     background: white !important;
      box-shadow: none !important;
  }
 
  .function_button {
@@ -590,7 +622,27 @@ prompt_list = [
  ]
  ]
 
 
  gpt_state = 0
  VOICE = "en-GB-SoniaNeural"
@@ -722,11 +774,11 @@ def init_openai_api_key(api_key=""):
  global gpt_state
  gpt_state=1
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]* 3 + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3+[gr.update(visible=False)]
  else:
  gpt_state=0
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
- return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*4
 
  def init_wo_openai_api_key():
  global gpt_state
@@ -801,9 +853,9 @@ async def chat_input_callback(*args):
  return state, state, None, audio,log_state,history
 
 
- def upload_callback(image_input, state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None):
  print("narritive", narritive)
-
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
  image_input = image_input['background']
 
@@ -848,76 +900,60 @@ def upload_callback(image_input, state, log_state, visual_chatgpt=None, openai_a
  print('upload_callback: add caption to chatGPT memory')
  new_image_path = get_new_image_name('chat_image', func_name='upload')
  image_input.save(new_image_path)
  visual_chatgpt.current_image = new_image_path
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
- parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
  print(parsed_data)
  parsed_data = json.loads(parsed_data.replace("'", "\""))
  name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
  gender=gender.lower()
  print("gender",gender)
 
 
  if language=="English":
-     if narritive=="Third-person" :
-         state = [
-             (
-                 None,
-                 f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
-             )
-         ]
-     elif narritive=="Single-Persona: Artist":
-         state = [
-             (
-                 None,
-                 f"🧑‍🎨 Hello, I am the {artist}. Welcome to explore my painting, '{name}'. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant insights and thoughts behind my creation."
-             )
-         ]
-     elif narritive=="Multi-Persona: Objects":
-         state = [
-             (
-                 None,
-                 f"🎨 Hello, Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with relevant insights and thoughts from the perspective of the objects within the painting"
-             )
-         ]
  elif language=="Chinese":
-     if narritive == "Third-person":
-         state = [
-             (
-                 None,
-                 f"🤖 你好,我是 EyeSee。让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供相关的信息。"
-             )
-         ]
-     elif narritive == "Single-Persona: Artist":
-         state = [
-             (
-                 None,
-                 f"🧑‍🎨 你好,我是{artist}。欢迎探索我的画作《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供我的创作背后的相关见解和想法。"
-             )
-         ]
-     elif narritive == "Multi-Persona: Objects":
-         state = [
-             (
-                 None,
-                 f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
-             )
-         ]
 
  log_state += [(name,None)]
  log_state=log_state+[(paragraph,None)]
  log_state=log_state+[(narritive,None)]
  log_state=log_state+state
  log_state = log_state + [("%% basic information %%", None)]
 
  history=[]
- history.append({"role": "assistant", "content": paragraph+state[0][1]})
 
 
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
-     original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history]
 
@@ -1056,7 +1092,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
  read_info = re.sub(r'[#[\]!*]','',focus_info)
  read_info = emoji.replace_emoji(read_info,replace="")
  print("read info",read_info)
- if naritive=="Item":
  parsed_data = get_gpt_response(openai_api_key, new_crop_save_path,prompt = f"Based on the information {focus_info}, return the gender of this item, returns its most likely gender, do not return unknown, in the format {{\"gender\": \"<gender>\"}}")
  parsed_data = json.loads(parsed_data)
 
@@ -1088,7 +1124,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,log_state,history
 
 
-
 
  def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):
 
@@ -1101,8 +1137,6 @@ def generate_prompt(focus_type, paragraph,length, sentiment, factuality, languag
  'language': language
  }
 
- naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}
-
  naritive_value=naritive_mapping[naritive]
 
  if mapped_value != -1:
@@ -1139,9 +1173,17 @@ def get_gpt_response(api_key, image_path, prompt, history=None):
  history = []
 
  messages = history[:]
-
  if image_path:
-     base64_image = encode_image(image_path)
  messages.append({
      "role": "user",
      "content": [
@@ -1152,7 +1194,7 @@ def get_gpt_response(api_key, image_path, prompt, history=None):
  {
      "type": "image_url",
      "image_url": {
-         "url": f"data:image/jpeg;base64,{base64_image}"
      }
  }
  ]
@@ -1176,6 +1218,10 @@ def get_gpt_response(api_key, image_path, prompt, history=None):
  print("gpt result",result)
  try:
      content = result['choices'][0]['message']['content']
      return content
  except (KeyError, IndexError, json.JSONDecodeError) as e:
      return json.dumps({"error": "Failed to parse model output", "details": str(e)})
@@ -1533,15 +1579,17 @@ async def texttospeech(text, language,gender='female'):
  return None
 
  # give the reason of recommendation
- async def associate(focus_info,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,evt: gr.SelectData):
  rec_path=evt._data['value']['image']['path']
  index=evt.index
  print("rec_path",rec_path)
- prompt="""
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects in the second painting that may be related to the selected object and list one fact of selected object, one fact of related object in the second painting and one analysis between two objects as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
- """
- prompt=prompt.format(Wiki_caption=focus_info,language=language,length=length)
- result=get_gpt_response(openai_api_key, rec_path, prompt)
  print("recommend result",result)
  reason = [(None, f"{result}")]
  log_state = log_state + [(narritive, None)]
@@ -1550,10 +1598,10 @@ async def associate(focus_info,openai_api_key,language,autoplay,length,log_state
  read_info = re.sub(r'[#[\]!*]','',result)
  read_info = emoji.replace_emoji(read_info,replace="")
  print("associate",read_info)
  if autoplay:
      audio_output = await texttospeech(read_info, language)
-     return reason,audio_output,log_state,index
- return reason,None,log_state,index
 
  def change_naritive(task_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
  if task_type=="Session 1":
@@ -1648,9 +1696,9 @@ def create_ui():
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
 
  examples = [
-     ["test_images/1.The Ambassadors.jpg"],
-     ["test_images/2.Football Players.jpg"],
-     ["test_images/3.Along the River during the Qingming Festival.jpeg"],
      # ["test_images/test3.jpg"],
      # ["test_images/test4.jpg"],
      # ["test_images/test5.jpg"],
@@ -1704,7 +1752,9 @@ def create_ui():
  output_audio = gr.HTML(
      label="Synthesised Audio", elem_classes="custom-output"
  )
- with gr.Row():
  with gr.Column(scale=6):
      with gr.Column(visible=False) as modules_not_need_gpt:
 
@@ -1735,11 +1785,7 @@ def create_ui():
  year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
  material_label = gr.Button(value="Style: ",elem_classes="info_btn")
 
- with gr.Row():
-     gr.Examples(
-         examples=examples,
-         inputs=[example_image],
-     )
 
 
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
@@ -1824,11 +1870,6 @@ def create_ui():
  )
 
 
-
-
-
-
-
  with gr.Column(scale=4):
      with gr.Column(visible=True) as module_key_input:
          openai_api_key = gr.Textbox(
@@ -1892,6 +1933,12 @@ def create_ui():
  with gr.Row():
      with gr.Column(scale=6):
          with gr.Column(visible=False) as recommend:
              gallery_result = gr.Gallery(
                  label="Recommendations",
                  height="auto",
@@ -1906,19 +1953,20 @@ def create_ui():
              # show_share_button=True,
              # show_download_button=True
              )
-             sort_rec=gr.Dropdown(["1", "2", "3", "4"],
-                 value=[],
-                 multiselect=True,
-                 label="Score", info="Please sort the pictures according to your preference"
-             )
 
      with gr.Column(scale=4,visible=False) as reco_reasons:
          recommend_bot = gr.Chatbot(label="Recommend Reasons", elem_classes="chatbot",height=600)
          recommend_score = gr.Radio(
-             choices=[0,1,2,3,4,5],
              label="Score",
              interactive=True)
-
 
 
@@ -2088,14 +2136,14 @@ def create_ui():
  # )
  recommend_btn.click(
      fn=infer,
-     inputs=[new_crop_save_path,image_path],
-     outputs=[gallery_result]
  )
 
  gallery_result.select(
      associate,
-     inputs=[paragraph,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive],
-     outputs=[recommend_bot,output_audio,log_state,pic_index],
 
 
  )
@@ -2255,11 +2303,11 @@ def create_ui():
 
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
      outputs=[export, modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt,
-     modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
      outputs=[export,modules_need_gpt1, modules_need_gpt3,
          modules_not_need_gpt,
-         modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
 
  # disable_chatGPT_button.click(init_wo_openai_api_key,
  # outputs=[export,modules_need_gpt1, modules_need_gpt3,
@@ -2375,11 +2423,11 @@ def create_ui():
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
  # paragraph,artist,gender,image_path])
 
- image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log],
      [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-     paragraph,artist,gender,image_path,log_state,history_log])
 
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2400,11 +2448,11 @@ def create_ui():
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
  # [chatbot, state, aux_state,output_audio])
  # submit_button_text.click(lambda: "", None, chat_input)
- example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log],
      [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-     paragraph,artist,gender,image_path, log_state,history_log])
 
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
 
 
  return image_features
 
  @spaces.GPU
+ def infer(crop_image_path,full_image_path,state,language):
      gallery_output = []
+     if crop_image_path:
+         input_image = Image.open(crop_image_path).convert("RGB")
+         input_features = extract_features_siglip(input_image.convert("RGB"))
+         input_features = input_features.detach().cpu().numpy()
+         input_features = np.float32(input_features)
+         faiss.normalize_L2(input_features)
+         distances, indices = index.search(input_features, 2)
+         for i,v in enumerate(indices[0]):
+             sim = -distances[0][i]
+             image_url = df.iloc[v]["Link"]
+             img_retrieved = read_image_from_url(image_url)
+             gallery_output.append(img_retrieved)
+
+         input_image = Image.open(full_image_path).convert("RGB")
+         input_features = extract_features_siglip(input_image.convert("RGB"))
+         input_features = input_features.detach().cpu().numpy()
+         input_features = np.float32(input_features)
+         faiss.normalize_L2(input_features)
+         distances, indices = index.search(input_features, 2)
+         for i,v in enumerate(indices[0]):
+             sim = -distances[0][i]
+             image_url = df.iloc[v]["Link"]
+             img_retrieved = read_image_from_url(image_url)
+             gallery_output.append(img_retrieved)
+         if language=="English":
+             msg="🖼️ Please refer to the section below to see the recommended results."
+         else:
+             msg="🖼️ 请到下方查看推荐结果。"
+         state+=[(None,msg)]
+
+         return gallery_output,state,state
+     else:
+         input_image = Image.open(full_image_path).convert("RGB")
+         input_features = extract_features_siglip(input_image.convert("RGB"))
+         input_features = input_features.detach().cpu().numpy()
+         input_features = np.float32(input_features)
+         faiss.normalize_L2(input_features)
+         distances, indices = index.search(input_features, 4)
+         for i,v in enumerate(indices[0]):
+             sim = -distances[0][i]
+             image_url = df.iloc[v]["Link"]
+             img_retrieved = read_image_from_url(image_url)
+             gallery_output.append(img_retrieved)
+         if language=="English":
+             msg="🖼️ Please refer to the section below to see the recommended results."
+         else:
+             msg="🖼️ 请到下方查看推荐结果。"
+         state+=[(None,msg)]
+
+         return gallery_output,state,state
+
 
 
  ###############################################################################
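For context, infer assumes a module-level FAISS index built over SigLIP embeddings of the gallery images listed in df["Link"]. Below is a minimal sketch of how such an index might be assembled; the checkpoint name, CSV path, column name, and helper definitions are assumptions for illustration, not taken from this commit.

# Sketch only: build a cosine-similarity FAISS index over SigLIP image features.
import faiss
import numpy as np
import pandas as pd
import requests
import torch
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, SiglipVisionModel

processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")  # assumed checkpoint
model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")

def read_image_from_url(url):
    # Download one gallery image; infer() reuses a helper like this for retrieved results.
    return Image.open(BytesIO(requests.get(url, timeout=10).content)).convert("RGB")

def extract_features_siglip(image):
    # Illustrative stand-in for the app's feature extractor.
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        return model(**inputs).pooler_output  # shape (1, hidden_dim)

df = pd.read_csv("gallery.csv")  # assumed to contain a "Link" column of image URLs
feats = []
for url in df["Link"]:
    f = extract_features_siglip(read_image_from_url(url)).cpu().numpy().astype(np.float32)
    faiss.normalize_L2(f)  # normalize so inner product equals cosine similarity
    feats.append(f)
feats = np.vstack(feats)

index = faiss.IndexFlatIP(feats.shape[1])  # inner-product index over normalized vectors
index.add(feats)

With an index built this way, index.search(query_features, k) returns the k most similar gallery rows, which is what infer maps back to df["Link"] URLs.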
 
      background: white !important;
      border: none !important;
      box-shadow: none !important;
+     font-size: 15px !important;
+     min-width: 6rem !important;
+     max-width: 10rem !important;
  }
 
+ .info_btn_interact {
+     background: rgb(242, 240, 233) !important;
      box-shadow: none !important;
+     font-size: 15px !important;
+     min-width: 6rem !important;
+     max-width: 10rem !important;
  }
 
  .function_button {
 
  ]
  ]
 
+ recommendation_prompt=[
+ '''I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:Recommendation reason: {{Recommendation based on objects in the image or Recommendation based on overall visual similarity}}
+ Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1.Each bullet point should be in {language} language, with a response length of about {length} words.''',
+ '''
+ When generating the answer, you should tell others that you are the creators of the first paintings and generate the text in the tone and manner as if you are the creator of the painting.
+ I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
+ Recommendation reason: {{ As the author of the first painting, I recommend based on the object I painted OR As the author of the first painting, I recommend based on the overall similarity in appearance}}
+ Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I.
+ Each bullet point should be in {language} language, with a response length of about {length} words.
+ ''',
+ '''
+ When generating answers, you should tell people that you are the object itself that was selected in the first painting, and generate text in the tone and manner in which you are the object
+ I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
+ Recommendation reason: {{As an object in the first painting, I am recommending based on myself OR As an object in the first painting, I am recommending based on the overall similarity of the first painting's appearance}}
+ Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I.
+ Each bullet point should be in {language} language, with a response length of about {length} words.
+ '''
+
+ ]
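The three templates line up with the Third-person, Artist, and Objects entries of naritive_mapping. A small illustrative check of how one is selected and filled in; note that the doubled braces {{...}} survive str.format as literal braces, while only {language} and {length} are substituted.

# Sketch only: select a persona template and fill its placeholders.
naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}

persona = naritive_mapping["Single-Persona: Artist"]   # -> 1
prompt = recommendation_prompt[persona].format(language="English", length=60)

# {{ ... }} in the template renders as literal { ... } in the final prompt text,
# so the "Recommendation reason: {...}" choices reach the model verbatim.
print(prompt)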
 
  gpt_state = 0
  VOICE = "en-GB-SoniaNeural"
 
  global gpt_state
  gpt_state=1
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]* 3 + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*4+[gr.update(visible=False)]
  else:
  gpt_state=0
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
+ return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*5
 
  def init_wo_openai_api_key():
  global gpt_state
 
  return state, state, None, audio,log_state,history
 
 
+ async def upload_callback(image_input,state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None,autoplay=False,session="Session 1"):
  print("narritive", narritive)
+ print("image input",image_input)
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
  image_input = image_input['background']
 
 
  print('upload_callback: add caption to chatGPT memory')
  new_image_path = get_new_image_name('chat_image', func_name='upload')
  image_input.save(new_image_path)
+ print("img_path",new_image_path)
  visual_chatgpt.current_image = new_image_path
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+ parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\"}")
  print(parsed_data)
  parsed_data = json.loads(parsed_data.replace("'", "\""))
  name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
  gender=gender.lower()
  print("gender",gender)
 
+
  if language=="English":
+     if naritive_mapping[narritive]==0 :
+         msg=f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
+
+     elif naritive_mapping[narritive]==1:
+         msg=f"🧑‍🎨 Hello, I am the {artist}. Welcome to explore my painting, '{name}'. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant insights and thoughts behind my creation."
+
+     elif naritive_mapping[narritive]==2:
+         msg=f"🎨 Hello, Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with relevant insights and thoughts from the perspective of the objects within the painting"
+
  elif language=="Chinese":
+     if naritive_mapping[narritive]==0:
+         msg=f"🤖 你好,我是 EyeSee。让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供相关的信息。"
+
+     elif naritive_mapping[narritive]==1:
+         msg=f"🧑‍🎨 你好,我是{artist}。欢迎探索我的画作《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供我的创作背后的相关见解和想法。"
+
+     elif naritive_mapping[narritive]==2:
+         msg=f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
+
 
+ state = [(msg,None)]
  log_state += [(name,None)]
  log_state=log_state+[(paragraph,None)]
  log_state=log_state+[(narritive,None)]
  log_state=log_state+state
  log_state = log_state + [("%% basic information %%", None)]
+ read_info=emoji.replace_emoji(msg,replace="")
 
  history=[]
+ history.append({"role": "assistant", "content": paragraph+msg})
+
+ audio_output = None
+ if autoplay:
+     audio_output = await texttospeech(read_info, language,gender)
 
 
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
+     original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history,audio_output]
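upload_callback expects the metadata reply to be a flat, dictionary-shaped string, which it normalizes with replace("'", "\"") before json.loads. A hedged example of that round trip; the sample values are only illustrative.

import json

# Example reply in the single-quoted form the model sometimes returns (values are illustrative).
reply = "{ 'name': 'The Ambassadors', 'artist': 'Hans Holbein the Younger', 'year': '1533 (Northern Renaissance)', 'style': 'Oil on oak', 'gender': 'Male' }"

parsed_data = json.loads(reply.replace("'", "\""))
name, artist, year, material, gender = (
    parsed_data["name"], parsed_data["artist"], parsed_data["year"],
    parsed_data["style"], parsed_data["gender"].lower(),
)
# Note: the quote swap is a blunt normalization; it would corrupt values that themselves
# contain apostrophes, which is why the prompt asks for clean dictionary output.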
 
 
  read_info = re.sub(r'[#[\]!*]','',focus_info)
  read_info = emoji.replace_emoji(read_info,replace="")
  print("read info",read_info)
+ if naritive_mapping[naritive]==2:
  parsed_data = get_gpt_response(openai_api_key, new_crop_save_path,prompt = f"Based on the information {focus_info}, return the gender of this item, returns its most likely gender, do not return unknown, in the format {{\"gender\": \"<gender>\"}}")
  parsed_data = json.loads(parsed_data)
 
  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,log_state,history
 
 
+ naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}
 
  def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):
 
  'language': language
  }
 
  naritive_value=naritive_mapping[naritive]
 
  if mapped_value != -1:
 
  history = []
 
  messages = history[:]
+ base64_images = []
+
  if image_path:
+     if isinstance(image_path, list):
+         for img in image_path:
+             base64_image = encode_image(img)
+             base64_images.append(base64_image)
+     else:
+         base64_image = encode_image(image_path)
+         base64_images.append(base64_image)
+
  messages.append({
      "role": "user",
      "content": [
  {
      "type": "image_url",
      "image_url": {
+         "url": f"data:image/jpeg;base64,{base64_images}"
      }
  }
  ]
  print("gpt result",result)
  try:
      content = result['choices'][0]['message']['content']
+     if content.startswith("```json"):
+         content = content[7:]
+     if content.endswith("```"):
+         content = content[:-3]
      return content
  except (KeyError, IndexError, json.JSONDecodeError) as e:
      return json.dumps({"error": "Failed to parse model output", "details": str(e)})
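get_gpt_response now collects every path into base64_images so the recommendation flow can send two images at once. A minimal sketch of a multi-image Chat Completions payload with one image_url entry per encoded image, which is the format the vision endpoint expects; the helper names here are assumptions.

import base64

def encode_image(image_path):
    # Return the file contents as a base64 string suitable for a data: URL.
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

def build_vision_message(prompt, image_paths):
    # One text part plus one image_url part per image.
    content = [{"type": "text", "text": prompt}]
    for path in image_paths:
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(path)}"},
        })
    return {"role": "user", "content": content}

# Example: message = build_vision_message(prompt, [new_crop_save_path, rec_path])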
 
  return None
 
  # give the reason of recommendation
+ async def associate(image_path,new_crop,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,evt: gr.SelectData):
+     persona=naritive_mapping[narritive]
      rec_path=evt._data['value']['image']['path']
      index=evt.index
      print("rec_path",rec_path)
+     prompt=recommendation_prompt[persona].format(language=language,length=length)
+     if new_crop:
+         image_paths=[new_crop,rec_path]
+     else:
+         image_paths=[image_path,rec_path]
+     result=get_gpt_response(openai_api_key, image_paths, prompt)
      print("recommend result",result)
      reason = [(None, f"{result}")]
      log_state = log_state + [(narritive, None)]
      read_info = re.sub(r'[#[\]!*]','',result)
      read_info = emoji.replace_emoji(read_info,replace="")
      print("associate",read_info)
+     audio_output=None
      if autoplay:
          audio_output = await texttospeech(read_info, language)
+     return reason,audio_output,log_state,index,gr.update(value=[])
 
  def change_naritive(task_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
      if task_type=="Session 1":
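associate is wired to gallery_result.select, so Gradio passes a gr.SelectData event describing the clicked thumbnail. A hedged sketch of the documented event fields a handler can rely on; recent Gradio versions expose .index and .value, whereas evt._data used above is an internal field.

import gradio as gr

def on_gallery_select(evt: gr.SelectData):
    # evt.index is the position of the clicked item; evt.value is its data
    # (for a Gallery this is typically a dict describing the image).
    print("selected index:", evt.index)
    print("selected value:", evt.value)
    return evt.index

# Example wiring: gallery_result.select(on_gallery_select, inputs=None, outputs=[pic_index])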
 
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
 
  examples = [
+     ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg"],
+     ["test_images/2.Football Players.jpg","test_images/task2.jpg"],
+     ["test_images/3.Along the River during the Qingming Festival.jpeg","test_images/task3.jpg"],
      # ["test_images/test3.jpg"],
      # ["test_images/test4.jpg"],
      # ["test_images/test5.jpg"],
 
  output_audio = gr.HTML(
      label="Synthesised Audio", elem_classes="custom-output"
  )
+ with gr.Row():
+     with gr.Column(scale=1,min_width=50,visible=False) as instruct:
+         task_instuction=gr.Image(type="pil", interactive=True, elem_classes="task_instruct",height=650,label=None)
  with gr.Column(scale=6):
      with gr.Column(visible=False) as modules_not_need_gpt:
 
  year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
  material_label = gr.Button(value="Style: ",elem_classes="info_btn")
 
+
 
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
 
  )
 
 
  with gr.Column(scale=4):
      with gr.Column(visible=True) as module_key_input:
          openai_api_key = gr.Textbox(
 
  with gr.Row():
      with gr.Column(scale=6):
          with gr.Column(visible=False) as recommend:
+             sort_rec=gr.Dropdown(["1", "2", "3", "4"],
+                 value=[],
+                 multiselect=True,
+                 label="Score", info="Please sort the pictures according to your preference"
+             )
+
              gallery_result = gr.Gallery(
                  label="Recommendations",
                  height="auto",
              # show_share_button=True,
              # show_download_button=True
              )
+
 
      with gr.Column(scale=4,visible=False) as reco_reasons:
          recommend_bot = gr.Chatbot(label="Recommend Reasons", elem_classes="chatbot",height=600)
          recommend_score = gr.Radio(
+             choices=[1,2,3,4,5,6,7],
              label="Score",
              interactive=True)
+
+ with gr.Row():
+     gr.Examples(
+         examples=examples,
+         inputs=[example_image,task_instuction],
+     )
 
 
 
  # )
  recommend_btn.click(
      fn=infer,
+     inputs=[new_crop_save_path,image_path,state,language],
+     outputs=[gallery_result,chatbot,state]
  )
 
  gallery_result.select(
      associate,
+     inputs=[image_path,new_crop_save_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive],
+     outputs=[recommend_bot,output_audio,log_state,pic_index,recommend_score],
 
 
  )
 
 
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
      outputs=[export, modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt,
+     modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3])
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
      outputs=[export,modules_need_gpt1, modules_need_gpt3,
          modules_not_need_gpt,
+         modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3])
 
  # disable_chatGPT_button.click(init_wo_openai_api_key,
  # outputs=[export,modules_need_gpt1, modules_need_gpt3,
 
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
  # paragraph,artist,gender,image_path])
 
+ image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,task_type],
      [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+     paragraph,artist,gender,image_path,log_state,history_log,output_audio])
 
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
 
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
  # [chatbot, state, aux_state,output_audio])
  # submit_button_text.click(lambda: "", None, chat_input)
+ example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,task_type],
      [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
      image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
      name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+     paragraph,artist,gender,image_path, log_state,history_log,output_audio])
 
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])