Niki Zhang committed on

Commit 9ef960d · verified · 1 Parent(s): 10d7824

Update app.py

Files changed (1):
  1. app.py (+565, -445)

app.py CHANGED
@@ -1,3 +1,4 @@
  from io import BytesIO
  import io
  from math import inf
@@ -346,20 +347,33 @@ def extract_features_siglip(image):
      return image_features

  @spaces.GPU
- def infer(image_path):
-     input_image = Image.open(image_path).convert("RGB")
-     input_features = extract_features_siglip(input_image.convert("RGB"))
-     input_features = input_features.detach().cpu().numpy()
-     input_features = np.float32(input_features)
-     faiss.normalize_L2(input_features)
-     distances, indices = index.search(input_features, 3)
-     gallery_output = []
-     for i,v in enumerate(indices[0]):
-         sim = -distances[0][i]
-         image_url = df.iloc[v]["Link"]
-         img_retrieved = read_image_from_url(image_url)
-         gallery_output.append(img_retrieved)
-     return gallery_output

  ###############################################################################
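The removed single-path `infer` (and its replacement later in this diff) lean on retrieval objects that are defined elsewhere in app.py and never appear in this hunk: a FAISS index over SigLIP image features (`index`), a metadata DataFrame with a "Link" column (`df`), and a `read_image_from_url` helper. A minimal sketch of that assumed setup, for orientation only:

```python
# Illustrative sketch only -- the real app builds these objects elsewhere in app.py.
from io import BytesIO

import faiss
import numpy as np
import pandas as pd
import requests
from PIL import Image

# Assume `embeddings` is an (N, D) float32 matrix of SigLIP image features.
embeddings = np.float32(np.random.rand(16, 768))
faiss.normalize_L2(embeddings)                   # normalise so inner product behaves like cosine similarity
index = faiss.IndexFlatIP(embeddings.shape[1])   # the index that infer() queries
index.add(embeddings)

# One metadata row per indexed vector; infer() looks up the artwork URL by row position.
df = pd.DataFrame({"Link": [f"https://example.com/artwork_{i}.jpg" for i in range(16)]})

def read_image_from_url(url):
    # Assumed helper: download an image and return it as a PIL image.
    response = requests.get(url, timeout=10)
    return Image.open(BytesIO(response.content)).convert("RGB")
```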
@@ -547,28 +561,14 @@ filtered_language_dict = {
547
  'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'}
548
  }
549
 
550
- focus_map = {
551
- "D":0,
552
- "DA":1,
553
- "DAI":2,
554
  "Judge":3
555
  }
556
 
557
- '''
558
- prompt_list = [
559
- 'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
560
- 'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
561
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
562
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
563
- ]
564
 
565
- prompt_list = [
566
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
567
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
568
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
569
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
570
- ]
571
- '''
572
  prompt_list = [
573
  [
574
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
@@ -722,11 +722,11 @@ def init_openai_api_key(api_key=""):
722
  global gpt_state
723
  gpt_state=1
724
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
725
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
726
  else:
727
  gpt_state=0
728
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
729
- return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2
730
 
731
  def init_wo_openai_api_key():
732
  global gpt_state
@@ -734,6 +734,7 @@ def init_wo_openai_api_key():
734
  # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
735
  return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]
736
 
 
737
  def get_click_prompt(chat_input, click_state, click_mode):
738
  inputs = json.loads(chat_input)
739
  if click_mode == 'Continuous':
@@ -771,35 +772,38 @@ def update_click_state(click_state, caption, click_mode):
771
  raise NotImplementedError
772
 
773
  async def chat_input_callback(*args):
774
- visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input = args
775
  message = chat_input["text"]
776
  prompt="Please help me answer the question with this painting {question} in {language}."
777
  prompt=prompt.format(question=message, language=language)
778
- state = state + [(message,None)]
779
  if visual_chatgpt is not None:
780
- result=get_gpt_response(api_key, image_input,prompt+message)
781
- state = state + [(None, result)]
782
  read_info = re.sub(r'[#[\]!*]','',result)
783
  read_info = emoji.replace_emoji(read_info,replace="")
784
- # state, _, aux_state, _ = visual_chatgpt.run_text(message, state, aux_state)
785
- last_text, last_response = state[-1]
786
- print("last response",last_response)
 
 
 
 
787
  if autoplay==False:
788
- return state, state, aux_state, None
789
 
790
  else:
791
- audio = await texttospeech(read_info,language,autoplay,gender)
792
- return state, state, aux_state, audio
793
  else:
794
  response = "Text refiner is not initilzed, please input openai api key."
795
  state = state + [(chat_input, response)]
796
- audio = await texttospeech(response,language,autoplay,gender)
797
- return state, state, None, audio
798
-
799
 
800
 
801
- def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None):
802
  print("narritive", narritive)
 
803
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
804
  image_input = image_input['background']
805
 
@@ -810,6 +814,21 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
810
 
811
 
812
  click_state = [[], [], []]
813
  image_input = image_resize(image_input, res=1024)
814
 
815
  model = build_caption_anything_with_models(
@@ -831,18 +850,14 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
831
  image_input.save(new_image_path)
832
  visual_chatgpt.current_image = new_image_path
833
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
834
- # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
835
- Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
836
- AI_prompt = "Received."
837
- visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
838
- visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
839
- print("memory",visual_chatgpt.agent.memory)
840
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
841
  parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
 
842
  parsed_data = json.loads(parsed_data.replace("'", "\""))
843
  name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
844
  gender=gender.lower()
845
  print("gender",gender)
 
846
 
847
 
848
  if language=="English":
@@ -888,13 +903,21 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
888
  None,
889
  f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
890
  )
891
- ]
 
892
 
893
-
894
-
895
 
896
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
897
- original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path]
898
 
899
 
900
 
@@ -933,20 +956,23 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
933
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
934
  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
935
  # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
936
-
937
- # state = state + [("Selected image point: {}, Input label: {}".format(
938
- # prompt["input_point"],
939
- # '+' if prompt["input_label"] == "1" else '-'
940
- # ), None)]
941
 
942
- output_label = ['+' if label == 1 else '-' for label in prompt["input_label"]]
943
 
944
- state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], output_label), None)]
945
-
946
-
 
 
 
 
 
 
 
 
947
 
948
- # update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
949
- text = out['generated_captions']['raw_caption']
950
  input_mask = np.array(out['mask'].convert('P'))
951
  image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)
952
 
@@ -957,62 +983,75 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
957
  out_state = out
958
 
959
  if visual_chatgpt is not None:
960
- print('inference_click: add caption to chatGPT memory')
961
  new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
962
  Image.open(out["crop_save_path"]).save(new_crop_save_path)
963
- point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
964
- visual_chatgpt.point_prompt = point_prompt
965
-
966
-
967
- print("new crop save",new_crop_save_path)
968
 
969
  yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
970
 
971
 
972
- query_focus = {
973
- "D": "Provide a description of the item.",
974
- "DA": "Provide a description and analysis of the item.",
975
- "DAI": "Provide a description, analysis, and interpretation of the item.",
976
- "Judge": "Evaluate the item."
977
- }
 
978
 
979
 
980
  async def submit_caption(naritive, state,length, sentiment, factuality, language,
981
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
982
- autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path, gender):
983
 
984
 
985
- state = state + [(query_focus[focus_type], None)]
986
-
987
  click_index = click_index_state
988
 
989
- # if pre_click_index==click_index:
990
- # click_index = (click_index[0] - 1, click_index[1] - 1)
991
- # pre_click_index = click_index
992
- # else:
993
- # pre_click_index = click_index
994
  print("click_index",click_index)
995
  print("input_points_state",input_points_state)
996
  print("input_labels_state",input_labels_state)
997
 
998
  prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
999
 
 
 
 
1000
  print("Prompt:", prompt)
1001
  print("click",click_index)
1002
 
1003
- # image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
1004
- # input_points=input_points, input_labels=input_labels)
1005
-
1006
 
1007
  # if not args.disable_gpt and text_refiner:
1008
  if not args.disable_gpt:
1009
  print("new crop save",new_crop_save_path)
1010
- focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt)
1011
  if focus_info.startswith('"') and focus_info.endswith('"'):
1012
  focus_info=focus_info[1:-1]
1013
  focus_info=focus_info.replace('#', '')
1014
  # state = state + [(None, f"Wiki: {paragraph}")]
1015
- state = state + [(None, f"{focus_info}")]
 
1016
  print("new_cap",focus_info)
1017
  read_info = re.sub(r'[#[\]!*]','',focus_info)
1018
  read_info = emoji.replace_emoji(read_info,replace="")
@@ -1028,27 +1067,25 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
1028
  print("error gpt responese")
1029
  print("item gender",gender)
1030
 
1031
- # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1032
- # input_points=input_points, input_labels=input_labels)
1033
  try:
1034
  if autoplay==False:
1035
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
1036
 
1037
- audio_output = await texttospeech(read_info, language, autoplay,gender)
1038
  print("done")
1039
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1040
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
1041
 
1042
  except Exception as e:
1043
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1044
  print(f"Error during TTS prediction: {str(e)}")
1045
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
1046
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
1047
 
1048
  else:
1049
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1050
  print(f"Error during TTS prediction: {str(e)}")
1051
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None
1052
 
1053
 
1054
 
@@ -1088,70 +1125,57 @@ def encode_image(image_path):
1088
  with open(image_path, "rb") as image_file:
1089
  return base64.b64encode(image_file.read()).decode('utf-8')
1090
 
1091
- def get_gpt_response(api_key, image_path, prompt, enable_wiki=None):
1092
- headers = {
1093
- "Content-Type": "application/json",
1094
- "Authorization": f"Bearer {api_key}"
1095
- }
1096
 
1097
  headers = {
1098
  "Content-Type": "application/json",
1099
  "Authorization": f"Bearer {api_key}"
1100
  }
1101
- base64_images=[]
1102
 
1103
  if image_path:
1104
- if isinstance(image_path, list):
1105
-
1106
- for img in image_path:
1107
- base64_image = encode_image(img)
1108
- base64_images.append(base64_image)
1109
- else:
1110
- base64_image = encode_image(image_path)
1111
- base64_images.append(base64_image)
1112
-
1113
- payload = {
1114
- "model": "gpt-4o",
1115
- "messages": [
1116
  {
1117
- "role": "user",
1118
- "content": [
1119
- {
1120
- "type": "text",
1121
- "text": prompt
1122
- },
1123
- {
1124
- "type": "image_url",
1125
- "image_url": {
1126
- "url": f"data:image/jpeg;base64,{base64_images}"
1127
- }
1128
- }
1129
- ]
1130
- }
1131
- ],
1132
- "max_tokens": 300
1133
- }
1134
- else:
1135
- payload = {
1136
- "model": "gpt-4o",
1137
- "messages": [
1138
- {
1139
- "role": "user",
1140
- "content": [
1141
- {
1142
- "type": "text",
1143
- "text": prompt
1144
  }
1145
- ]
1146
- }
1147
- ],
1148
- "max_tokens": 300
1149
- }
 
 
 
1150
 
1151
  # Sending the request to the OpenAI API
1152
  response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
1153
  result = response.json()
1154
-
1155
  try:
1156
  content = result['choices'][0]['message']['content']
1157
  return content
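Note that the removed payload interpolates the entire `base64_images` list into a single data URL (`f"data:image/jpeg;base64,{base64_images}"`), which cannot decode as an image once more than one picture is passed. A hedged sketch of the presumably intended shape, with one `image_url` part per encoded image (this is not the committed code):

```python
# Sketch, not the committed code: one image_url part per base64-encoded image.
content = [{"type": "text", "text": prompt}]
for b64 in base64_images:
    content.append({
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
    })

payload = {
    "model": "gpt-4o",
    "messages": [{"role": "user", "content": content}],
    "max_tokens": 300,
}
```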
@@ -1295,61 +1319,62 @@ def clear_chat_memory(visual_chatgpt, keep_global=False):
1295
  visual_chatgpt.global_prompt = ""
1296
 
1297
 
1298
- def export_chat_log(chat_state, paragraph, liked, disliked,log_list):
1299
  try:
 
1300
  if not chat_state:
1301
  return None
1302
- chat_log = f"Image Description: {paragraph}\n\n"
1303
  for entry in chat_state:
1304
  user_message, bot_response = entry
1305
  if user_message and bot_response:
1306
  chat_log += f"User: {user_message}\nBot: {bot_response}\n"
 
 
1307
  elif user_message:
1308
  chat_log += f"User: {user_message}\n"
 
1309
  elif bot_response:
1310
  chat_log += f"Bot: {bot_response}\n"
 
1311
 
1312
- # 添加 liked 和 disliked 信息
1313
- chat_log += "\nLiked Responses:\n"
1314
- for response in liked:
1315
- chat_log += f"{response}\n"
1316
 
1317
- chat_log += "\nDisliked Responses:\n"
1318
- for response in disliked:
1319
- chat_log += f"{response}\n"
1320
 
1321
- print("export log...")
1322
- print("chat_log", chat_log)
1323
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
1324
- temp_file.write(chat_log.encode('utf-8'))
1325
- temp_file_path = temp_file.name
1326
- print(temp_file_path)
1327
- log_list.append(temp_file_path)
1328
  return log_list,log_list
1329
  except Exception as e:
1330
  print(f"An error occurred while exporting the chat log: {e}")
1331
- return None
1332
 
1333
- async def get_artistinfo(artist_name,api_key,state,language,autoplay,length):
1334
  prompt = f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received. Start your response with 'Artist Background: '."
1335
  res=get_gpt_response(api_key,None,prompt)
1336
  state = state + [(None, res)]
1337
  read_info = re.sub(r'[#[\]!*]','',res)
1338
  read_info = emoji.replace_emoji(read_info,replace="")
 
1339
 
1340
 
1341
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1342
  # input_points=input_points, input_labels=input_labels)
1343
  if autoplay:
1344
- audio_output = await texttospeech(read_info, language,autoplay)
1345
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1346
- return state, state,audio_output
1347
- return state, state,None
1348
 
1349
 
1350
- async def get_yearinfo(year,api_key,state,language,autoplay,length):
1351
  prompt = f"Provide a concise summary of about {length} words in {language} on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history with 'History Background: '."
1352
  res=get_gpt_response(api_key,None,prompt)
 
1353
  state = state + [(None, res)]
1354
  read_info = re.sub(r'[#[\]!*]','',res)
1355
  read_info = emoji.replace_emoji(read_info,replace="")
@@ -1358,47 +1383,47 @@ async def get_yearinfo(year,api_key,state,language,autoplay,length):
1358
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1359
  # input_points=input_points, input_labels=input_labels)
1360
  if autoplay:
1361
- audio_output = await texttospeech(read_info, language,autoplay)
1362
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1363
- return state, state,audio_output
1364
- return state, state,None
1365
 
1366
 
1367
 
1368
 
1369
 
1370
- async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
1371
 
1372
- # state = state + [(None, f"Caption Everything: {paragraph}")]
1373
- Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1374
- AI_prompt = "Received."
1375
- visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
1376
 
1377
- # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
1378
- visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
1379
- # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
1380
- audio_output=await texttospeech(paragraph,language,autoplay)
1381
- return paragraph,audio_output
1382
 
1383
- def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
1384
 
1385
- model = build_caption_anything_with_models(
1386
- args,
1387
- api_key="",
1388
- captioner=shared_captioner,
1389
- sam_model=shared_sam_model,
1390
- ocr_reader=shared_ocr_reader,
1391
- text_refiner=text_refiner,
1392
- session_id=iface.app_id
1393
- )
1394
- paragraph = model.inference_cap_everything(image_input, verbose=True)
1395
- # state = state + [(None, f"Caption Everything: {paragraph}")]
1396
- Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1397
- AI_prompt = "Received."
1398
- visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
1399
- visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
1400
- # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
1401
- return paragraph
1402
 
1403
 
1404
 
@@ -1490,62 +1515,119 @@ def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragr
1490
 
1491
  # return like_state, dislike_state
1492
 
1493
- async def texttospeech(text, language, autoplay,gender='female'):
1494
  try:
1495
- if autoplay:
1496
- voice = filtered_language_dict[language][gender]
1497
- communicate = edge_tts.Communicate(text=text, voice=voice,rate="+25%")
1498
- file_path = "output.wav"
1499
- await communicate.save(file_path)
1500
- with open(file_path, "rb") as audio_file:
1501
- audio_bytes = BytesIO(audio_file.read())
1502
- audio = base64.b64encode(audio_bytes.read()).decode("utf-8")
1503
- print("TTS processing completed.")
1504
- audio_style = 'style="width:210px;"'
1505
- audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
1506
- else:
1507
- audio_player = None
1508
- print("Autoplay is disabled.")
1509
  return audio_player
 
1510
  except Exception as e:
1511
  print(f"Error in texttospeech: {e}")
1512
  return None
1513
 
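The `texttospeech` helper above is a thin wrapper over edge-tts. A minimal, standalone round trip with the same call pattern (the voice comes from `filtered_language_dict`; the output file name is only illustrative):

```python
# Minimal edge-tts round trip mirroring the call pattern in texttospeech().
import asyncio

import edge_tts

async def demo_tts():
    voice = "zh-HK-HiuGaaiNeural"  # e.g. filtered_language_dict['Cantonese']['female']
    communicate = edge_tts.Communicate(text="Hello from EyeSee.", voice=voice, rate="+25%")
    await communicate.save("demo_output.wav")  # illustrative output path

asyncio.run(demo_tts())
```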
1514
- async def associate(focus_info,openai_api_key,language,state,autoplay,length, evt: gr.SelectData):
 
1515
  rec_path=evt._data['value']['image']['path']
 
1516
  print("rec_path",rec_path)
1517
  prompt="""
1518
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects in the second painting that may be related to the selected object and list one fact of selected object, one fact of related object in the second painting and one analysis between two objects as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
1519
  """
1520
  prompt=prompt.format(Wiki_caption=focus_info,language=language,length=length)
1521
  result=get_gpt_response(openai_api_key, rec_path, prompt)
1522
- state = state + [(None, f"{result}")]
 
 
 
 
1523
  read_info = re.sub(r'[#[\]!*]','',result)
1524
  read_info = emoji.replace_emoji(read_info,replace="")
1525
  print("associate",read_info)
1526
  if autoplay:
1527
- audio_output = await texttospeech(read_info, language, autoplay)
1528
- return state,state,audio_output
1529
- return state,state,None
1530
-
1531
-
1532
 
 
 
1533
 
 
1534
 
1535
 
1536
- def print_like_dislike(x: gr.LikeData,like_res,dislike_res,state):
1537
  print(x.index, x.value, x.liked)
1538
  if x.liked == True:
1539
  print("liked")
1540
- like_res.append(x.value)
1541
- print(like_res)
1542
  state = state + [(None, f"Liked Received 👍")]
1543
  else:
1544
- dislike_res.append(x.value)
1545
  state = state + [(None, f"Disliked Received 👎")]
1546
- return like_res,dislike_res,state
1547
-
1548
 
1549
 
1550
  def toggle_icons_and_update_prompt(point_prompt):
1551
  new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
@@ -1568,13 +1650,13 @@ def create_ui():
      description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

      examples = [
-         ["test_images/ambass.jpg"],
-         ["test_images/test1.jpg"],
-         ["test_images/test2.jpg"],
-         ["test_images/test3.jpg"],
-         ["test_images/test4.jpg"],
-         ["test_images/test5.jpg"],
-         ["test_images/Picture5.png"],
      ]

@@ -1582,7 +1664,13 @@ def create_ui():
1582
  css=css,
1583
  theme=gr.themes.Base()
1584
  ) as iface:
 
1585
  state = gr.State([])
 
 
 
 
 
1586
  out_state = gr.State(None)
1587
  click_state = gr.State([[], [], []])
1588
  origin_image = gr.State(None)
@@ -1597,49 +1685,32 @@ def create_ui():
1597
  input_mask_state = gr.State(np.zeros((1, 1)))
1598
  input_points_state = gr.State([])
1599
  input_labels_state = gr.State([])
 
1600
  new_crop_save_path = gr.State(None)
1601
  image_input_nobackground = gr.State(None)
1602
  artist=gr.State(None)
1603
- like_res=gr.State([])
1604
- dislike_res=gr.State([])
1605
  gr.Markdown(title)
1606
  gr.Markdown(description)
1607
  point_prompt = gr.State("Positive")
1608
  log_list=gr.State([])
1609
  gender=gr.State('female')
 
1610
  image_path=gr.State('')
1611
- # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
1612
- # with gr.Column(scale=0.5):
1613
- # # gr.Markdown("Left side content")
1614
-
1615
- # with gr.Column(scale=0.5):
1616
- # with gr.Row(align="right",visible=False) as language_select:
1617
- # language = gr.Dropdown(
1618
- # ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1619
- # value="English", label="Language", interactive=True)
1620
-
1621
- # with gr.Row(align="right",visible=False) as autoplay:
1622
- # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1623
- # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1624
-
1625
-
1626
-
1627
-
1628
-
1629
-
1630
- # with gr.Row(align="right",visible=False) as language_select:
1631
- # language = gr.Dropdown(
1632
- # ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1633
- # value="English", label="Language", interactive=True)
1634
-
1635
- # with gr.Row(align="right",visible=False) as autoplay:
1636
- # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1637
- # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1638
-
1639
  with gr.Row():
1640
  with gr.Column(scale=6):
1641
  with gr.Column(visible=False) as modules_not_need_gpt:
1642
- with gr.Tab("Base(GPT Power)") as base_tab:
 
1643
  image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1644
  with gr.Row():
1645
  name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1647,7 +1718,7 @@ def create_ui():
1647
  year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1648
  material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")
1649
 
1650
- with gr.Tab("Base2") as base_tab2:
1651
  image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1652
  with gr.Row():
1653
  name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1665,6 +1736,12 @@ def create_ui():
1665
  artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
1666
  year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1667
  material_label = gr.Button(value="Style: ",elem_classes="info_btn")
 
 
 
 
 
 
1668
 
1669
 
1670
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
@@ -1673,10 +1750,9 @@ def create_ui():
1673
  add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
1674
  minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
1675
  clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
1676
- clear_button_image = gr.Button(value="Change", interactive=True,elem_classes="tools_button")
1677
- focus_d = gr.Button(value="D",interactive=True,elem_classes="function_button",variant="primary")
1678
- focus_da = gr.Button(value="DA",interactive=True,elem_classes="function_button",variant="primary")
1679
- focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button",variant="primary")
1680
  focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")
1681
 
1682
  recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
@@ -1743,28 +1819,13 @@ def create_ui():
1743
  value="No",
1744
  label="Expert",
1745
  interactive=True)
1746
-
1747
- with gr.Column(visible=False) as recommend:
1748
- gallery_result = gr.Gallery(
1749
- label="Result",
1750
- height="auto",
1751
- columns=4
1752
- # columns=4,
1753
- # rows=2,
1754
- # show_label=False,
1755
- # allow_preview=True,
1756
- # object_fit="contain",
1757
- # height="auto",
1758
- # preview=True,
1759
- # show_share_button=True,
1760
- # show_download_button=True
1761
- )
1762
-
1763
  with gr.Column(visible=True) as modules_not_need_gpt3:
1764
  gr.Examples(
1765
  examples=examples,
1766
  inputs=[example_image],
1767
  )
 
 
1768
 
1769
 
1770
 
@@ -1780,20 +1841,29 @@ def create_ui():
1780
  type="password")
1781
  with gr.Row():
1782
  enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
1783
- disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,
1784
- variant='primary')
1785
  with gr.Column(visible=False) as module_notification_box:
1786
  notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)
1787
 
1788
- with gr.Column() as modules_need_gpt0:
1789
- with gr.Column(visible=False) as modules_need_gpt2:
1790
- paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
1791
- cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
1792
 
1793
- with gr.Column(visible=False) as modules_not_need_gpt2:
 
1794
  with gr.Blocks():
1795
  chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
1796
- with gr.Column(visible=False) as modules_need_gpt3:
1797
  chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
1798
  with gr.Row():
1799
  clear_button_text = gr.Button(value="Clear Chat", interactive=True)
@@ -1801,13 +1871,9 @@ def create_ui():
1801
  # submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
1802
  # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
1803
  # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
1804
- with gr.Row():
1805
- naritive = gr.Radio(
1806
- choices=["Third-person", "Single-Persona: Artist","Multi-Persona: Objects"],
1807
- value="Third-person",
1808
- label="narritive",
1809
- scale=5,
1810
- interactive=True)
1811
 
1812
 
1813
  # TTS interface hidden initially
@@ -1824,10 +1890,46 @@ def create_ui():
1824
  with gr.Row():
1825
  submit_tts = gr.Button(value="Submit", interactive=True)
1826
  clear_tts = gr.Button(value="Clear", interactive=True)
 
 
1827
  ###############################################################################
1828
  ############# this part is for text to image #############
1829
  ###############################################################################
1830
-
1831
  with gr.Row(variant="panel",visible=False) as text2image_model:
1832
 
1833
  with gr.Column():
@@ -1922,10 +2024,14 @@ def create_ui():
1922
  # # show_download_button=True
1923
  # )
1924
 
1925
- with gr.Row():
1926
-
1927
  chat_log_file = gr.File(label="Download Chat Log",scale=5)
1928
- with gr.Row(visible=False, elem_id="top_row") as top_row:
 
 
 
 
 
1929
  language = gr.Dropdown(
1930
  ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1931
  value="English", label="Language", interactive=True, elem_classes="custom-language"
@@ -1938,12 +2044,12 @@ def create_ui():
1938
  interactive=True,
1939
  label="Generated Caption Length",
1940
  )
1941
- auto_play = gr.Checkbox(
1942
- label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
1943
- )
1944
- output_audio = gr.HTML(
1945
- label="Synthesised Audio", elem_classes="custom-output"
1946
- )
1947
 
1948
 
1949
 
@@ -1984,14 +2090,14 @@ def create_ui():
1984
  # )
1985
  recommend_btn.click(
1986
  fn=infer,
1987
- inputs=[new_crop_save_path],
1988
  outputs=[gallery_result]
1989
  )
1990
 
1991
  gallery_result.select(
1992
  associate,
1993
- inputs=[paragraph,openai_api_key,language,state,auto_play,length],
1994
- outputs=[chatbot,state,output_audio],
1995
 
1996
 
1997
  )
@@ -2093,7 +2199,7 @@ def create_ui():
2093
 
2094
  # mv_images = gr.State()
2095
 
2096
- # chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])
2097
 
2098
  # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
2099
  # fn=generate_mvs,
@@ -2138,81 +2244,80 @@ def create_ui():
2138
  # queue=False,
2139
  # show_progress=False
2140
  # )
 
 
 
 
 
 
2141
 
2142
 
2143
 
2144
 
2145
 
2146
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
2147
- outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
2148
- modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend])
2149
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
2150
- outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
2151
  modules_not_need_gpt,
2152
- modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend])
2153
- # openai_api_key.submit(init_openai_api_key,
2154
- # outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
2155
- # modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
2156
- # enable_chatGPT_button.click(init_openai_api_key,
2157
- # outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
2158
- # modules_not_need_gpt,
2159
- # modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
2160
-
2161
- disable_chatGPT_button.click(init_wo_openai_api_key,
2162
- outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
2163
- modules_not_need_gpt,
2164
- modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])
2165
-
2166
- artist_label_base2.click(
2167
- get_artistinfo,
2168
- inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length],
2169
- outputs=[chatbot,state,output_audio]
2170
- )
2171
  artist_label.click(
2172
  get_artistinfo,
2173
- inputs=[artist_label,openai_api_key,state,language,auto_play,length],
2174
- outputs=[chatbot,state,output_audio]
2175
- )
2176
- artist_label_traj.click(
2177
- get_artistinfo,
2178
- inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length],
2179
- outputs=[chatbot,state,output_audio]
2180
  )
 
 
 
 
 
2181
 
2182
- year_label_base2.click(
2183
- get_yearinfo,
2184
- inputs=[year_label_base2,openai_api_key,state,language,auto_play,length],
2185
- outputs=[chatbot,state,output_audio]
2186
- )
2187
  year_label.click(
2188
  get_yearinfo,
2189
- inputs=[year_label,openai_api_key,state,language,auto_play,length],
2190
- outputs=[chatbot,state,output_audio]
2191
- )
2192
- year_label_traj.click(
2193
- get_yearinfo,
2194
- inputs=[year_label_traj,openai_api_key,state,language,auto_play,length],
2195
- outputs=[chatbot,state,output_audio]
2196
  )
 
 
 
 
 
2197
 
2198
 
2199
- enable_chatGPT_button.click(
2200
- lambda: (None, [], [], [[], [], []], "", "", ""),
2201
- [],
2202
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2203
- queue=False,
2204
- show_progress=False
2205
- )
2206
- openai_api_key.submit(
2207
- lambda: (None, [], [], [[], [], []], "", "", ""),
2208
- [],
2209
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2210
- queue=False,
2211
- show_progress=False
2212
- )
2213
 
2214
- cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
2215
- [paragraph_output,output_audio])
2216
 
2217
  clear_button_click.click(
2218
  lambda x: ([[], [], []], x),
@@ -2222,53 +2327,61 @@ def create_ui():
2222
  show_progress=False
2223
  )
2224
  clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
2225
- clear_button_image.click(
2226
- lambda: (None, [], [], [[], [], []], "", "", ""),
2227
- [],
2228
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2229
- queue=False,
2230
- show_progress=False
2231
- )
2232
- clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
2233
  clear_button_text.click(
2234
- lambda: ([], [], [[], [], [], []]),
2235
  [],
2236
- [chatbot, state, click_state],
2237
  queue=False,
2238
  show_progress=False
2239
  )
2240
  clear_button_text.click(clear_chat_memory, inputs=[visual_chatgpt])
2241
 
2242
  image_input.clear(
2243
- lambda: (None, [], [], [[], [], []], "", "", ""),
2244
  [],
2245
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2246
  queue=False,
2247
  show_progress=False
2248
  )
2249
 
2250
  image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])
 
2251
 
2252
 
2253
 
2254
 
2255
- image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
2256
- [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2257
- image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2258
- name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2259
- paragraph,artist,gender,image_path])
2260
 
2261
- image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
2262
- [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2263
- image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2264
- name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2265
- paragraph,artist,gender,image_path])
2266
 
2267
- image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key,language,naritive],
2268
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2269
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2270
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2271
- paragraph,artist,gender,image_path])
2272
 
2273
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
2274
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2282,45 +2395,45 @@ def create_ui():
2282
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
2283
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
2284
  # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
2285
- chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path],
2286
- [chatbot, state, aux_state,output_audio])
2287
  # chat_input.submit(lambda: "", None, chat_input)
2288
  chat_input.submit(lambda: {"text": ""}, None, chat_input)
2289
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2290
  # [chatbot, state, aux_state,output_audio])
2291
  # submit_button_text.click(lambda: "", None, chat_input)
2292
- example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key,language,naritive],
2293
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2294
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2295
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2296
- paragraph,artist,gender,image_path])
2297
 
2298
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
2299
 
2300
- def on_click_tab_selected():
2301
- if gpt_state ==1:
2302
- print(gpt_state)
2303
- print("using gpt")
2304
- return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
2305
- else:
2306
- print("no gpt")
2307
- print("gpt_state",gpt_state)
2308
- return [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2
2309
-
2310
- def on_base_selected():
2311
- if gpt_state ==1:
2312
- print(gpt_state)
2313
- print("using gpt")
2314
- return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
2315
- else:
2316
- print("no gpt")
2317
- return [gr.update(visible=False)]*4
2318
-
2319
-
2320
- traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2321
- click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2322
- base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
2323
- base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
2324
 
2325
 
2326
 
@@ -2330,7 +2443,7 @@ def create_ui():
2330
  inputs=[
2331
  origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
2332
  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
2333
- out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2334
  ],
2335
  outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
2336
  show_progress=False, queue=True
@@ -2341,10 +2454,10 @@ def create_ui():
2341
  submit_caption,
2342
  inputs=[
2343
  naritive, state,length, sentiment, factuality, language,
2344
- out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph,focus_d,openai_api_key,new_crop_save_path,gender
2345
  ],
2346
  outputs=[
2347
- chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
2348
  ],
2349
  show_progress=True,
2350
  queue=True
@@ -2358,11 +2471,12 @@ def create_ui():
2358
  submit_caption,
2359
  inputs=[
2360
  naritive,state,length, sentiment, factuality, language,
2361
- out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,auto_play, paragraph,focus_da,openai_api_key,new_crop_save_path
 
2362
  ],
2363
  outputs=[
2364
- chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
2365
- ],
2366
  show_progress=True,
2367
  queue=True
2368
  )
@@ -2373,10 +2487,10 @@ def create_ui():
2373
  inputs=[
2374
  naritive,state,length, sentiment, factuality, language,
2375
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2376
- auto_play, paragraph,focus_dai,openai_api_key,new_crop_save_path
2377
  ],
2378
  outputs=[
2379
- chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
2380
  ],
2381
  show_progress=True,
2382
  queue=True
@@ -2388,10 +2502,10 @@ def create_ui():
2388
  inputs=[
2389
  naritive,state,length, sentiment, factuality, language,
2390
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2391
- auto_play, paragraph,focus_dda,openai_api_key,new_crop_save_path
2392
  ],
2393
  outputs=[
2394
- chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
2395
  ],
2396
  show_progress=True,
2397
  queue=True
@@ -2431,20 +2545,26 @@ def create_ui():
2431
 
2432
  export_button.click(
2433
  export_chat_log,
2434
- inputs=[state,paragraph,like_res,dislike_res,log_list],
2435
  outputs=[chat_log_file,log_list],
2436
  queue=True
2437
  )
2438
 
2439
  naritive.change(
2440
- lambda: (None, [], [], [[], [], []], "", "", ""),
2441
- [],
2442
- [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2443
  queue=False,
2444
  show_progress=False
2445
 
2446
  )
2447
 
 
 
 
 
 
 
2448
  # upvote_btn.click(
2449
  # handle_liked,
2450
  # inputs=[state,like_res],
 
The corresponding new-file content for the hunks above (added lines marked with +):

+ import datetime
  from io import BytesIO
  import io
  from math import inf
 
      return image_features

  @spaces.GPU
+ def infer(crop_image_path,full_image_path):
+     input_image = Image.open(crop_image_path).convert("RGB")
+     input_features = extract_features_siglip(input_image.convert("RGB"))
+     input_features = input_features.detach().cpu().numpy()
+     input_features = np.float32(input_features)
+     faiss.normalize_L2(input_features)
+     distances, indices = index.search(input_features, 2)
+     gallery_output = []
+     for i,v in enumerate(indices[0]):
+         sim = -distances[0][i]
+         image_url = df.iloc[v]["Link"]
+         img_retrieved = read_image_from_url(image_url)
+         gallery_output.append(img_retrieved)
+
+     input_image = Image.open(full_image_path).convert("RGB")
+     input_features = extract_features_siglip(input_image.convert("RGB"))
+     input_features = input_features.detach().cpu().numpy()
+     input_features = np.float32(input_features)
+     faiss.normalize_L2(input_features)
+     distances, indices = index.search(input_features, 2)
+     for i,v in enumerate(indices[0]):
+         sim = -distances[0][i]
+         image_url = df.iloc[v]["Link"]
+         img_retrieved = read_image_from_url(image_url)
+         gallery_output.append(img_retrieved)
+
+     return gallery_output

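With the new signature, `infer` retrieves two nearest neighbours for the selected crop and two for the whole painting, so the Recommend button now fills the gallery with up to four images. An illustrative call (paths are placeholders):

```python
# Illustrative only: placeholder paths, and assumes the FAISS index is non-empty.
gallery = infer("chat_image/example_crop.png", "chat_image/example_full.png")
print(len(gallery))  # 4 retrieved images: 2 for the crop + 2 for the full painting
```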
  ###############################################################################
 
      'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'}
  }

+ focus_map = {
+     "Describe":0,
+     "D+Analysis":1,
+     "DA+Interprete":2,
      "Judge":3
  }
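The focus buttons were renamed from D/DA/DAI to Describe/D+Analysis/DA+Interprete, and `focus_map` still turns the button label into an index. A hedged sketch of how that index is presumably consumed by `generate_prompt` via `prompt_list` (the real lookup lives elsewhere in app.py):

```python
# Hedged sketch: the focus index selects a template from prompt_list for the
# active narrative; generate_prompt() in app.py presumably does the equivalent.
focus_type = "D+Analysis"
focus_value = focus_map[focus_type]        # -> 1
narrative_templates = prompt_list[0]       # assumed: index 0 = "Third-person" narrative
prompt = narrative_templates[focus_value].format(
    Wiki_caption="A portrait of two ambassadors with symbolic objects.",
    language="English",
    length=60,
)
```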
572
  prompt_list = [
573
  [
574
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
 
722
  global gpt_state
723
  gpt_state=1
724
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
725
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]* 3 + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3+[gr.update(visible=False)]
726
  else:
727
  gpt_state=0
728
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
729
+ return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*4
730
 
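Both branches of the updated `init_openai_api_key` now return 15 values; in Gradio the length of each return must match the `outputs=[...]` list wired to the handler in create_ui(), so a quick count is worth keeping in mind when these lists change:

```python
# Illustrative count of the two return branches above; both must stay equal to
# the number of components listed in outputs=[...] for openai_api_key.submit
# and enable_chatGPT_button.click.
success_branch = 1 + 1 + 3 + 3 + 3 + 3 + 1   # updates plus [text_refiner, visual_chatgpt, None]
failure_branch = 6 + 2 + 3 + 4
assert success_branch == failure_branch == 15
```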
731
  def init_wo_openai_api_key():
732
  global gpt_state
 
734
  # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
735
  return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]
736
 
737
+
738
  def get_click_prompt(chat_input, click_state, click_mode):
739
  inputs = json.loads(chat_input)
740
  if click_mode == 'Continuous':
 
772
  raise NotImplementedError
773
 
774
  async def chat_input_callback(*args):
775
+ visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input,log_state,history = args
776
  message = chat_input["text"]
777
  prompt="Please help me answer the question with this painting {question} in {language}."
778
  prompt=prompt.format(question=message, language=language)
779
+
780
  if visual_chatgpt is not None:
781
+ result=get_gpt_response(api_key, image_input,prompt+message,history)
 
782
  read_info = re.sub(r'[#[\]!*]','',result)
783
  read_info = emoji.replace_emoji(read_info,replace="")
784
+ state = state + [(message,result)]
785
+ log_state += [(message,result)]
786
+ # log_state += [("%% chat messahe %%",None)]
787
+
788
+ history.append({"role": "user", "content": message})
789
+ history.append({"role": "assistant", "content": result})
790
+
791
  if autoplay==False:
792
+ return state, state, aux_state, None,log_state,history
793
 
794
  else:
795
+ audio = await texttospeech(read_info,language,gender)
796
+ return state, state, aux_state, audio,log_state,history
797
  else:
798
  response = "Text refiner is not initilzed, please input openai api key."
799
  state = state + [(chat_input, response)]
800
+ audio = await texttospeech(response,language,gender)
801
+ return state, state, None, audio,log_state,history
 
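`chat_input_callback` now threads a `history` list alongside the Gradio chat `state`. Judging from the appends above, it is an OpenAI-style message list handed back to `get_gpt_response` so follow-up questions keep their context; roughly:

```python
# Assumed shape of the new `history` state: a plain list of role/content dicts
# that grows by one user and one assistant entry per turn.
history = []
history.append({"role": "user", "content": "Who is the figure on the left?"})
history.append({"role": "assistant", "content": "He appears to be a nobleman in Renaissance dress."})
```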
802
 
803
 
804
+ def upload_callback(image_input, state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None):
805
  print("narritive", narritive)
806
+
807
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
808
  image_input = image_input['background']
809
 
 
814
 
815
 
816
  click_state = [[], [], []]
817
+
818
+
819
+ # width, height = image_input.size
820
+
821
+ # target_width=500
822
+ # target_height=650
823
+
824
+ # width_ratio = target_width / width
825
+ # height_ratio = target_height / height
826
+ # ratio = min(width_ratio, height_ratio)
827
+
828
+ # if ratio < 1.0:
829
+ # new_size = (int(width * ratio), int(height * ratio))
830
+ # image_input = image_input.resize(new_size, Image.ANTIALIAS)
831
+
832
  image_input = image_resize(image_input, res=1024)
833
 
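The commented-out block above was an inline aspect-ratio resize; the code now delegates to `image_resize(image_input, res=1024)`, which is defined elsewhere in app.py. A hedged sketch of what such a helper typically does:

```python
# Hedged sketch of an image_resize-style helper (the real one is elsewhere in
# app.py): cap the longer side at `res` pixels, preserve aspect ratio, never upscale.
from PIL import Image

def image_resize_sketch(image: Image.Image, res: int = 1024) -> Image.Image:
    width, height = image.size
    ratio = min(res / width, res / height, 1.0)
    if ratio < 1.0:
        image = image.resize((int(width * ratio), int(height * ratio)), Image.LANCZOS)
    return image
```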
834
  model = build_caption_anything_with_models(
 
850
  image_input.save(new_image_path)
851
  visual_chatgpt.current_image = new_image_path
852
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
 
853
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
854
  parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
855
+ print(parsed_data)
856
  parsed_data = json.loads(parsed_data.replace("'", "\""))
857
  name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
858
  gender=gender.lower()
859
  print("gender",gender)
860
+
861
 
862
 
863
  if language=="English":
 
903
  None,
904
  f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
905
  )
906
+ ]
907
+
908
+ log_state += [(name,None)]
909
+ log_state=log_state+[(paragraph,None)]
910
+ log_state=log_state+[(narritive,None)]
911
+ log_state=log_state+state
912
+ log_state = log_state + [("%% basic information %%", None)]
913
+
914
+ history=[]
915
+ history.append({"role": "assistant", "content": paragraph+state[0][1]})
916
 
917
+
 
918
 
919
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
920
+ original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history]
921
 
922
 
923
 
 
956
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
957
  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
958
  # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
 
 
 
 
 
959
 
960
+
961
 
962
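+ # A SAM point label of 1 marks a positive click (area added to the mask);
+ # any other label marks a negative click (area removed). Report it in the UI language.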
+ print(prompt["input_label"][-1])
963
+ if language=="English":
964
+ if prompt["input_label"][-1]==1:
965
+ msg="You've added an area at {}. ".format(prompt["input_point"][-1])
966
+ else:
967
+ msg="You've removed an area at {}. ".format(prompt["input_point"][-1])
968
+ else:
969
+ if prompt["input_label"][-1]==1:
970
+ msg="你添加了在 {} 的区域。 ".format(prompt["input_point"][-1])
971
+ else:
972
+ msg="你删除了在 {} 的区域。 ".format(prompt["input_point"][-1])
973
 
974
+ state = state + [(msg, None)]
975
+
976
  input_mask = np.array(out['mask'].convert('P'))
977
  image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)
978
 
 
983
  out_state = out
984
 
985
  if visual_chatgpt is not None:
 
986
  new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
987
  Image.open(out["crop_save_path"]).save(new_crop_save_path)
988
+ print("new crop save",new_crop_save_path)
 
 
 
 
989
 
990
  yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
991
 
992
 
993
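+ # Canned user-facing queries shown in the chat for each focus level; the list index
+ # matches the focus_map value (describe / +analysis / +interpretation / judge).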
+ query_focus_en = [
994
+ "Provide a description of the item.",
995
+ "Provide a description and analysis of the item.",
996
+ "Provide a description, analysis, and interpretation of the item.",
997
+ "Evaluate the item."
998
+ ]
999
+
1000
+ query_focus_zh = [
1001
+ "请描述一下这个物体。",
1002
+ "请描述和分析一下这个物体。",
1003
+ "请描述、分析和解释一下这个物体。",
1004
+ "请以艺术鉴赏的角度评价一下这个物体。"
1005
+ ]
1006
 
1007
 
1008
  async def submit_caption(naritive, state,length, sentiment, factuality, language,
1009
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1010
+ autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path, gender,log_state,history):
1011
 
1012
 
1013
+ focus_value=focus_map[focus_type]
 
1014
  click_index = click_index_state
1015
 
 
 
 
 
 
1016
  print("click_index",click_index)
1017
  print("input_points_state",input_points_state)
1018
  print("input_labels_state",input_labels_state)
1019
 
1020
  prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
1021
 
1022
+ log_state = log_state + [("Selected image point: {}, Input label: {}".format(input_points_state, input_labels_state), None)]
1023
+
1024
+
1025
  print("Prompt:", prompt)
1026
  print("click",click_index)
1027
 
1028
+ log_state = log_state + [(naritive, None)]
1029
+
 
1030
 
1031
  # if not args.disable_gpt and text_refiner:
1032
  if not args.disable_gpt:
1033
  print("new crop save",new_crop_save_path)
1034
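+ # Send the cropped object image plus the focus-specific prompt (and recent history) to GPT;
+ # surrounding quotes and '#' characters are stripped from the reply before display.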
+ focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt,history)
1035
  if focus_info.startswith('"') and focus_info.endswith('"'):
1036
  focus_info=focus_info[1:-1]
1037
  focus_info=focus_info.replace('#', '')
1038
  # state = state + [(None, f"Wiki: {paragraph}")]
1039
+ if language=="English":
1040
+ user_query=query_focus_en[focus_value]
1041
+
1042
+ else:
1043
+ user_query=query_focus_zh[focus_value]
1044
+
1045
+ state = state + [(user_query, f"{focus_info}")]
1046
+ log_state = log_state + [(user_query, None)]
1047
+ log_state = log_state + [(None, f"{focus_info}")]
1048
+
1049
+ # save history
1050
+ history.append({"role": "user", "content": user_query})
1051
+ history.append({"role": "assistant", "content": focus_info})
1052
+
1053
+
1054
+
1055
  print("new_cap",focus_info)
1056
  read_info = re.sub(r'[#[\]!*]','',focus_info)
1057
  read_info = emoji.replace_emoji(read_info,replace="")
 
1067
  print("error gpt responese")
1068
  print("item gender",gender)
1069
 
 
 
1070
  try:
1071
  if autoplay==False:
1072
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,log_state,history
1073
 
1074
+ audio_output = await texttospeech(read_info, language,gender)
1075
  print("done")
1076
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1077
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output,log_state,history
1078
 
1079
  except Exception as e:
1080
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1081
  print(f"Error during TTS prediction: {str(e)}")
1082
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
1083
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,log_state,history
1084
 
1085
  else:
1086
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1087
  print(f"Error during TTS prediction: {str(e)}")
1088
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,log_state,history
1089
 
1090
 
1091
 
 
1125
  with open(image_path, "rb") as image_file:
1126
  return base64.b64encode(image_file.read()).decode('utf-8')
1127
 
1128
+ def get_gpt_response(api_key, image_path, prompt, history=None):
 
 
 
 
1129
 
1130
  headers = {
1131
  "Content-Type": "application/json",
1132
  "Authorization": f"Bearer {api_key}"
1133
  }
 
1134
 
1135
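+ # Keep only the last four history messages to bound the request size, then build a
+ # chat-completions payload; when an image path is given it is attached as a base64 data URL.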
+ if history:
1136
+ if len(history) > 4:
1137
+ history = history[-4:]
1138
+ else:
1139
+ history = []
1140
+
1141
+ messages = history[:]
1142
+
1143
  if image_path:
1144
+ base64_image = encode_image(image_path)
1145
+ messages.append({
1146
+ "role": "user",
1147
+ "content": [
 
 
 
 
 
 
 
 
1148
  {
1149
+ "type": "text",
1150
+ "text": prompt
1151
+ },
1152
+ {
1153
+ "type": "image_url",
1154
+ "image_url": {
1155
+ "url": f"data:image/jpeg;base64,{base64_image}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1156
  }
1157
+ }
1158
+ ]
1159
+ })
1160
+ else:
1161
+ messages.append({"role": "user",
1162
+ "content":
1163
+ {
1164
+ "type": "text",
1165
+ "text": prompt
1166
+ }})
1167
+
1168
+ payload = {
1169
+ "model": "gpt-4o",
1170
+ "messages": messages,
1171
+ "max_tokens": 600
1172
+ }
1173
+
1174
 
1175
  # Sending the request to the OpenAI API
1176
  response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
1177
  result = response.json()
1178
+ print("gpt result",result)
1179
  try:
1180
  content = result['choices'][0]['message']['content']
1181
  return content
 
1319
  visual_chatgpt.global_prompt = ""
1320
 
1321
 
1322
+ def export_chat_log(chat_state,log_list,narrative):
1323
  try:
1324
+ chat_log=""
1325
  if not chat_state:
1326
  return None
 
1327
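+ # Flatten each (user, bot) pair to plain text; '%%'-prefixed entries are kept as section
+ # markers and one-sided turns are separated with '/////'. The log is saved as
+ # <timestamp>_<narrative>.txt in the current working directory.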
  for entry in chat_state:
1328
  user_message, bot_response = entry
1329
  if user_message and bot_response:
1330
  chat_log += f"User: {user_message}\nBot: {bot_response}\n"
1331
+ elif user_message and user_message.startswith("%%"):
1332
+ chat_log += f"{user_message}\n"
1333
  elif user_message:
1334
  chat_log += f"User: {user_message}\n"
1335
+ chat_log += f"///// \n"
1336
  elif bot_response:
1337
  chat_log += f"Bot: {bot_response}\n"
1338
+ chat_log += f"///// \n"
1339
 
1340
+ print("export log...")
1341
+ current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
1342
+ file_name = f"{current_time}_{narrative}.txt"
1343
+ file_path = os.path.join(os.getcwd(), file_name) # Save to the current working directory
1344
 
1345
+ with open(file_path, 'w', encoding='utf-8') as file:
1346
+ file.write(chat_log)
 
1347
 
1348
+ print(file_path)
1349
+
1350
+ log_list.append(file_path)
 
 
 
 
1351
  return log_list,log_list
1352
  except Exception as e:
1353
  print(f"An error occurred while exporting the chat log: {e}")
1354
+ return None,None
1355
 
1356
+ async def get_artistinfo(artist_name,api_key,state,language,autoplay,length,log_state):
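+ # Ask GPT (text-only, no image attached) for a short background summary of the artist
+ # and optionally read it aloud with TTS.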
1357
  prompt = f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received. Start your response with 'Artist Background: '."
1358
  res=get_gpt_response(api_key,None,prompt)
1359
  state = state + [(None, res)]
1360
  read_info = re.sub(r'[#[\]!*]','',res)
1361
  read_info = emoji.replace_emoji(read_info,replace="")
1362
+ log_state=log_state+[(f"res", None)]
1363
 
1364
 
1365
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1366
  # input_points=input_points, input_labels=input_labels)
1367
  if autoplay:
1368
+ audio_output = await texttospeech(read_info, language)
1369
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1370
+ return state, state,audio_output,log_state
1371
+ return state, state,None,log_state
1372
 
1373
 
1374
+ async def get_yearinfo(year,api_key,state,language,autoplay,length,log_state):
1375
  prompt = f"Provide a concise summary of about {length} words in {language} on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history with 'History Background: '."
1376
  res=get_gpt_response(api_key,None,prompt)
1377
+ log_state=log_state+[(f"res", None)]
1378
  state = state + [(None, res)]
1379
  read_info = re.sub(r'[#[\]!*]','',res)
1380
  read_info = emoji.replace_emoji(read_info,replace="")
 
1383
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
1384
  # input_points=input_points, input_labels=input_labels)
1385
  if autoplay:
1386
+ audio_output = await texttospeech(read_info, language)
1387
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
1388
+ return state, state,audio_output,log_state
1389
+ return state, state,None,log_state
1390
 
1391
 
1392
 
1393
 
1394
 
1395
+ # async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
1396
 
1397
+ # # state = state + [(None, f"Caption Everything: {paragraph}")]
1398
+ # Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1399
+ # AI_prompt = "Received."
1400
+ # visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
1401
 
1402
+ # # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
1403
+ # visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
1404
+ # # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
1405
+ # audio_output=await texttospeech(paragraph,language,autoplay)
1406
+ # return paragraph,audio_output
1407
 
1408
+ # def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
1409
 
1410
+ # model = build_caption_anything_with_models(
1411
+ # args,
1412
+ # api_key="",
1413
+ # captioner=shared_captioner,
1414
+ # sam_model=shared_sam_model,
1415
+ # ocr_reader=shared_ocr_reader,
1416
+ # text_refiner=text_refiner,
1417
+ # session_id=iface.app_id
1418
+ # )
1419
+ # paragraph = model.inference_cap_everything(image_input, verbose=True)
1420
+ # # state = state + [(None, f"Caption Everything: {paragraph}")]
1421
+ # Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1422
+ # AI_prompt = "Received."
1423
+ # visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
1424
+ # visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
1425
+ # # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
1426
+ # return paragraph
1427
 
1428
 
1429
 
 
1515
 
1516
  # return like_state, dislike_state
1517
 
1518
+ async def texttospeech(text, language,gender='female'):
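+ # Synthesize speech with edge-tts using the voice mapped from (language, gender),
+ # save it to output.wav, and return an autoplaying HTML <audio> element whose source
+ # is the audio embedded as a base64 data URI.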
1519
  try:
1520
+
1521
+ voice = filtered_language_dict[language][gender]
1522
+ communicate = edge_tts.Communicate(text=text, voice=voice,rate="+25%")
1523
+ file_path = "output.wav"
1524
+ await communicate.save(file_path)
1525
+ with open(file_path, "rb") as audio_file:
1526
+ audio_bytes = BytesIO(audio_file.read())
1527
+ audio = base64.b64encode(audio_bytes.read()).decode("utf-8")
1528
+ print("TTS processing completed.")
1529
+ audio_style = 'style="width:210px;"'
1530
+ audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
 
 
 
1531
  return audio_player
1532
+
1533
  except Exception as e:
1534
  print(f"Error in texttospeech: {e}")
1535
  return None
1536
 
1537
+ # give the reason for the recommendation
1538
+ async def associate(focus_info,openai_api_key,language,autoplay,length,log_state,sort_score,evt: gr.SelectData,narritive=None):
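+ # Called when a recommended painting is selected in the gallery: GPT is asked to relate
+ # the currently selected object (focus_info) to the chosen image, and evt.index is
+ # returned so the later recommendation score can be tied to this picture.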
1539
  rec_path=evt._data['value']['image']['path']
1540
+ index=evt.index
1541
  print("rec_path",rec_path)
1542
  prompt="""
1543
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects in the second painting that may be related to the selected object and list one fact of selected object, one fact of related object in the second painting and one analysis between two objects as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
1544
  """
1545
  prompt=prompt.format(Wiki_caption=focus_info,language=language,length=length)
1546
  result=get_gpt_response(openai_api_key, rec_path, prompt)
1547
+ print("recommend result",result)
1548
+ reason = [(None, f"{result}")]
1549
+ log_state = log_state + [(narritive, None)]
1550
+ log_state = log_state + [(f"image sort ranking {sort_score}", None)]
1551
+ log_state = log_state + [(None, f"{result}")]
1552
  read_info = re.sub(r'[#[\]!*]','',result)
1553
  read_info = emoji.replace_emoji(read_info,replace="")
1554
  print("associate",read_info)
1555
  if autoplay:
1556
+ audio_output = await texttospeech(read_info, language)
1557
+ return reason,audio_output,log_state,index
1558
+ return reason,None,log_state,index
 
 
1559
 
1560
+ def change_naritive(task_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
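+ # "Session 1" resets the whole workspace; otherwise append a persona-specific greeting
+ # (third-person, artist, or depicted objects) in the chosen language.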
1561
+ if task_type=="Session 1":
1562
+ return None, [], [], [[], [], []], "", None, []
1563
+ else:
1564
+ if language=="English":
1565
+ if narritive=="Third-person" :
1566
+ state += [
1567
+ (
1568
+ None,
1569
+ f"🤖 Hi, I am EyeSee. Let's explore this painting together."
1570
+ )
1571
+ ]
1572
+ elif narritive=="Single-Persona: Artist":
1573
+ state += [
1574
+ (
1575
+ None,
1576
+ f"🧑‍🎨 Let's delve into it from the perspective of the artist."
1577
+ )
1578
+ ]
1579
+ elif narritive=="Multi-Persona: Objects":
1580
+ state += [
1581
+ (
1582
+ None,
1583
+ f"🎨 Let's delve into it from the perspective of the objects depicted in the scene."
1584
+ )
1585
+ ]
1586
+ elif language=="Chinese":
1587
+ if narritive=="Third-person" :
1588
+ state += [
1589
+ (
1590
+ None,
1591
+ "🤖 让我们从第三方视角一起探索这幅画吧。"
1592
+ )
1593
+ ]
1594
+ elif narritive == "Single-Persona: Artist":
1595
+ state += [
1596
+ (
1597
+ None,
1598
+ "🧑‍🎨 让我们从艺术家的视角深入探索这幅画。"
1599
+ )
1600
+ ]
1601
+ elif narritive == "Multi-Persona: Objects":
1602
+ state += [
1603
+ (
1604
+ None,
1605
+ "🎨 让我们从画面中事物的视角深入探索这幅画。"
1606
+ )
1607
+ ]
1608
 
1609
+ return image_input, state, state, click_state, paragraph, origin_image, gr.update()  # leave gallery_result unchanged
1610
 
1611
 
1612
+ def print_like_dislike(x: gr.LikeData,state,log_state):
1613
  print(x.index, x.value, x.liked)
1614
  if x.liked == True:
1615
  print("liked")
1616
+ log_state=log_state+[(f"User liked this message", None)]
 
1617
  state = state + [(None, f"Liked Received 👍")]
1618
  else:
1619
+ log_state=log_state+[(f"User disliked this message", None)]
1620
  state = state + [(None, f"Disliked Received 👎")]
1621
+ log_state+=[("%% user interaction %%", None)]
1622
+ return log_state,state
1623
 
1624
+ def get_recommendationscore(index,score,log_state):
1625
+ log_state+=[(f"Picture {index} : {score}",None)]
1626
+ log_state+=[("%% recommendation %%",None)]
1627
+ return log_state
1628
+
1629
+
1630
+
1631
 
1632
  def toggle_icons_and_update_prompt(point_prompt):
1633
  new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
 
1650
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
1651
 
1652
  examples = [
1653
+ ["test_images/1.The Ambassadors.jpg"],
1654
+ ["test_images/2.Football Players.jpg"],
1655
+ ["test_images/3.Along the River during the Qingming Festival.jpeg"],
1656
+ # ["test_images/test3.jpg"],
1657
+ # ["test_images/test4.jpg"],
1658
+ # ["test_images/test5.jpg"],
1659
+ # ["test_images/Picture5.png"],
1660
 
1661
  ]
1662
 
 
1664
  css=css,
1665
  theme=gr.themes.Base()
1666
  ) as iface:
1667
+ # chat messages displayed in the chatbox
1668
  state = gr.State([])
1669
+ # messages exported to the chat log
1670
+ log_state=gr.State([])
1671
+ # history log for gpt
1672
+ history_log=gr.State([])
1673
+
1674
  out_state = gr.State(None)
1675
  click_state = gr.State([[], [], []])
1676
  origin_image = gr.State(None)
 
1685
  input_mask_state = gr.State(np.zeros((1, 1)))
1686
  input_points_state = gr.State([])
1687
  input_labels_state = gr.State([])
1688
+ # store the path of the cropped selected object
1689
  new_crop_save_path = gr.State(None)
1690
  image_input_nobackground = gr.State(None)
1691
  artist=gr.State(None)
 
 
1692
  gr.Markdown(title)
1693
  gr.Markdown(description)
1694
  point_prompt = gr.State("Positive")
1695
  log_list=gr.State([])
1696
  gender=gr.State('female')
1697
+ # store the whole image path
1698
  image_path=gr.State('')
1699
+ pic_index=gr.State(None)
1700
+
1701
+
1702
+ with gr.Row():
1703
+ auto_play = gr.Checkbox(
1704
+ label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
1705
+ )
1706
+ output_audio = gr.HTML(
1707
+ label="Synthesised Audio", elem_classes="custom-output"
1708
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1709
  with gr.Row():
1710
  with gr.Column(scale=6):
1711
  with gr.Column(visible=False) as modules_not_need_gpt:
1712
+
1713
+ with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
1714
  image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1715
  with gr.Row():
1716
  name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
 
1718
  year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1719
  material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")
1720
 
1721
+ with gr.Tab("Base2",visible=False) as base_tab2:
1722
  image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1723
  with gr.Row():
1724
  name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
 
1736
  artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
1737
  year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1738
  material_label = gr.Button(value="Style: ",elem_classes="info_btn")
1739
+
1740
+ with gr.Row():
1741
+ gr.Examples(
1742
+ examples=examples,
1743
+ inputs=[example_image],
1744
+ )
1745
 
1746
 
1747
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
 
1750
  add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
1751
  minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
1752
  clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
1753
+ focus_d = gr.Button(value="Describe",interactive=True,elem_classes="function_button",variant="primary")
1754
+ focus_da = gr.Button(value="D+Analysis",interactive=True,elem_classes="function_button",variant="primary")
1755
+ focus_dai = gr.Button(value="DA+Interprete",interactive=True,elem_classes="function_button",variant="primary")
 
1756
  focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")
1757
 
1758
  recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
 
1819
  value="No",
1820
  label="Expert",
1821
  interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1822
  with gr.Column(visible=True) as modules_not_need_gpt3:
1823
  gr.Examples(
1824
  examples=examples,
1825
  inputs=[example_image],
1826
  )
1827
+
1828
+
1829
 
1830
 
1831
 
 
1841
  type="password")
1842
  with gr.Row():
1843
  enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
1844
+ # disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,
1845
+ # variant='primary')
1846
  with gr.Column(visible=False) as module_notification_box:
1847
  notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)
1848
 
1849
+ # with gr.Column() as modules_need_gpt0:
1850
+ # with gr.Column(visible=False) as modules_need_gpt2:
1851
+ # paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
1852
+ # cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
1853
 
1854
+
1855
+
1856
+ with gr.Column(visible=False) as modules_not_need_gpt2:
1857
+ with gr.Row():
1858
+ naritive = gr.Radio(
1859
+ choices=["Third-person", "Single-Persona: Artist","Multi-Persona: Objects"],
1860
+ value="Third-person",
1861
+ label="Persona",
1862
+ scale=5,
1863
+ interactive=True)
1864
  with gr.Blocks():
1865
  chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
1866
+ with gr.Column() as modules_need_gpt3:
1867
  chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
1868
  with gr.Row():
1869
  clear_button_text = gr.Button(value="Clear Chat", interactive=True)
 
1871
  # submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
1872
  # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
1873
  # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
1874
+
1875
+
1876
+
 
 
 
 
1877
 
1878
 
1879
  # TTS interface hidden initially
 
1890
  with gr.Row():
1891
  submit_tts = gr.Button(value="Submit", interactive=True)
1892
  clear_tts = gr.Button(value="Clear", interactive=True)
1893
+
1894
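+ # Recommendation panel: a gallery of recommended paintings, a dropdown to rank them,
+ # a chatbot pane for the generated comparison reasons, and a 0-5 rating radio.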
+ with gr.Row():
1895
+ with gr.Column(scale=6):
1896
+ with gr.Column(visible=False) as recommend:
1897
+ gallery_result = gr.Gallery(
1898
+ label="Recommendations",
1899
+ height="auto",
1900
+ columns=4
1901
+ # columns=4,
1902
+ # rows=2,
1903
+ # show_label=False,
1904
+ # allow_preview=True,
1905
+ # object_fit="contain",
1906
+ # height="auto",
1907
+ # preview=True,
1908
+ # show_share_button=True,
1909
+ # show_download_button=True
1910
+ )
1911
+ sort_rec=gr.Dropdown(["1", "2", "3", "4"],
1912
+ value=[],
1913
+ multiselect=True,
1914
+ label="Score", info="Please sort the pictures according to your preference"
1915
+ )
1916
+
1917
+ with gr.Column(scale=4,visible=False) as reco_reasons:
1918
+ recommend_bot = gr.Chatbot(label="Recommend Reasons", elem_classes="chatbot",height=600)
1919
+ recommend_score = gr.Radio(
1920
+ choices=[0,1,2,3,4,5],
1921
+ label="Score",
1922
+ interactive=True)
1923
+
1924
+
1925
+
1926
+
1927
+
1928
+
1929
  ###############################################################################
1930
  ############# this part is for text to image #############
1931
  ###############################################################################
1932
+
1933
  with gr.Row(variant="panel",visible=False) as text2image_model:
1934
 
1935
  with gr.Column():
 
2024
  # # show_download_button=True
2025
  # )
2026
 
2027
+ with gr.Row(visible=False) as export:
 
2028
  chat_log_file = gr.File(label="Download Chat Log",scale=5)
2029
+
2030
+ with gr.Row(elem_id="top_row",visible=False) as top_row:
2031
+ task_type = gr.Dropdown(
2032
+ ["Session 1","Session 2"],
2033
+ value="Session 1", label="Task", interactive=True, elem_classes="custom-language"
2034
+ )
2035
  language = gr.Dropdown(
2036
  ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
2037
  value="English", label="Language", interactive=True, elem_classes="custom-language"
 
2044
  interactive=True,
2045
  label="Generated Caption Length",
2046
  )
2047
+ # auto_play = gr.Checkbox(
2048
+ # label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
2049
+ # )
2050
+ # output_audio = gr.HTML(
2051
+ # label="Synthesised Audio", elem_classes="custom-output"
2052
+ # )
2053
 
2054
 
2055
 
 
2090
  # )
2091
  recommend_btn.click(
2092
  fn=infer,
2093
+ inputs=[new_crop_save_path,image_path],
2094
  outputs=[gallery_result]
2095
  )
2096
 
2097
  gallery_result.select(
2098
  associate,
2099
+ inputs=[paragraph,openai_api_key,language,auto_play,length,log_state,sort_rec],
2100
+ outputs=[recommend_bot,output_audio,log_state,pic_index],
2101
 
2102
 
2103
  )
 
2199
 
2200
  # mv_images = gr.State()
2201
 
2202
+ chatbot.like(print_like_dislike, inputs=[state,log_state], outputs=[log_state,chatbot])
2203
 
2204
  # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
2205
  # fn=generate_mvs,
 
2244
  # queue=False,
2245
  # show_progress=False
2246
  # )
2247
+
2248
+ recommend_score.select(
2249
+ get_recommendationscore,
2250
+ inputs=[pic_index,recommend_score,log_state],
2251
+ outputs=[log_state],
2252
+ )
2253
 
2254
 
2255
 
2256
 
2257
 
2258
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
2259
+ outputs=[export, modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt,
2260
+ modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
2261
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
2262
+ outputs=[export,modules_need_gpt1, modules_need_gpt3,
2263
  modules_not_need_gpt,
2264
+ modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
2265
+
2266
+ # disable_chatGPT_button.click(init_wo_openai_api_key,
2267
+ # outputs=[export,modules_need_gpt1, modules_need_gpt3,
2268
+ # modules_not_need_gpt,
2269
+ # modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])
2270
+
2271
+ # artist_label_base2.click(
2272
+ # get_artistinfo,
2273
+ # inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length],
2274
+ # outputs=[chatbot,state,output_audio]
2275
+ # )
 
 
 
 
 
 
 
2276
  artist_label.click(
2277
  get_artistinfo,
2278
+ inputs=[artist_label,openai_api_key,state,language,auto_play,length,log_state],
2279
+ outputs=[chatbot,state,output_audio,log_state]
 
 
 
 
 
2280
  )
2281
+ # artist_label_traj.click(
2282
+ # get_artistinfo,
2283
+ # inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length],
2284
+ # outputs=[chatbot,state,output_audio]
2285
+ # )
2286
 
2287
+ # year_label_base2.click(
2288
+ # get_yearinfo,
2289
+ # inputs=[year_label_base2,openai_api_key,state,language,auto_play,length],
2290
+ # outputs=[chatbot,state,output_audio]
2291
+ # )
2292
  year_label.click(
2293
  get_yearinfo,
2294
+ inputs=[year_label,openai_api_key,state,language,auto_play,length,log_state],
2295
+ outputs=[chatbot,state,output_audio,log_state]
 
 
 
 
 
2296
  )
2297
+ # year_label_traj.click(
2298
+ # get_yearinfo,
2299
+ # inputs=[year_label_traj,openai_api_key,state,language,auto_play,length],
2300
+ # outputs=[chatbot,state,output_audio]
2301
+ # )
2302
 
2303
 
2304
+ # enable_chatGPT_button.click(
2305
+ # lambda: (None, [], [], [[], [], []], "", "", ""),
2306
+ # [],
2307
+ # [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2308
+ # queue=False,
2309
+ # show_progress=False
2310
+ # )
2311
+ # openai_api_key.submit(
2312
+ # lambda: (None, [], [], [[], [], []], "", "", ""),
2313
+ # [],
2314
+ # [image_input, chatbot, state, click_state, paragraph_output, origin_image],
2315
+ # queue=False,
2316
+ # show_progress=False
2317
+ # )
2318
 
2319
+ # cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
2320
+ # [paragraph_output,output_audio])
2321
 
2322
  clear_button_click.click(
2323
  lambda x: ([[], [], []], x),
 
2327
  show_progress=False
2328
  )
2329
  clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
2330
+ # clear_button_image.click(
2331
+ # lambda: (None, [], [], [[], [], []], "", "", ""),
2332
+ # [],
2333
+ # [image_input, chatbot, state, click_state, paragraph, origin_image],
2334
+ # queue=False,
2335
+ # show_progress=False
2336
+ # )
2337
+ # clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
2338
  clear_button_text.click(
2339
+ lambda: ([], [], [[], [], []],[]),
2340
  [],
2341
+ [chatbot, state, click_state,history_log],
2342
  queue=False,
2343
  show_progress=False
2344
  )
2345
  clear_button_text.click(clear_chat_memory, inputs=[visual_chatgpt])
2346
 
2347
  image_input.clear(
2348
+ lambda: (None, [], [], [[], [], []], "", None, []),
2349
  [],
2350
+ [image_input, chatbot, state, click_state, paragraph, origin_image,history_log],
2351
  queue=False,
2352
  show_progress=False
2353
  )
2354
 
2355
  image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])
2356
+
2357
+ # image_input.change(
2358
+ # lambda: ([], [], [[], [], []], [], []),
2359
+ # [],
2360
+ # [chatbot, state, click_state, history_log, log_state],
2361
+ # queue=False,
2362
+ # show_progress=False
2363
+ # )
2364
 
2365
 
2366
 
2367
 
2368
+ # image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
2369
+ # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2370
+ # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2371
+ # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2372
+ # paragraph,artist,gender,image_path])
2373
 
2374
+ # image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
2375
+ # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2376
+ # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2377
+ # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2378
+ # paragraph,artist,gender,image_path])
2379
 
2380
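+ # Uploading an image runs upload_callback, which resizes the image, asks GPT for an
+ # overall description plus metadata, and populates every copy of the name/artist/year/style labels.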
+ image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log],
2381
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2382
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2383
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2384
+ paragraph,artist,gender,image_path,log_state,history_log])
2385
 
2386
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
2387
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
 
2395
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
2396
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
2397
  # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
2398
+ chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path,log_state,history_log],
2399
+ [chatbot, state, aux_state,output_audio,log_state,history_log])
2400
  # chat_input.submit(lambda: "", None, chat_input)
2401
  chat_input.submit(lambda: {"text": ""}, None, chat_input)
2402
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2403
  # [chatbot, state, aux_state,output_audio])
2404
  # submit_button_text.click(lambda: "", None, chat_input)
2405
+ example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log],
2406
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2407
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2408
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2409
+ paragraph,artist,gender,image_path, log_state,history_log])
2410
 
2411
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
2412
 
2413
+ # def on_click_tab_selected():
2414
+ # if gpt_state ==1:
2415
+ # print(gpt_state)
2416
+ # print("using gpt")
2417
+ # return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
2418
+ # else:
2419
+ # print("no gpt")
2420
+ # print("gpt_state",gpt_state)
2421
+ # return [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2
2422
+
2423
+ # def on_base_selected():
2424
+ # if gpt_state ==1:
2425
+ # print(gpt_state)
2426
+ # print("using gpt")
2427
+ # return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
2428
+ # else:
2429
+ # print("no gpt")
2430
+ # return [gr.update(visible=False)]*4
2431
+
2432
+
2433
+ # traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2434
+ # click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2435
+ # base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
2436
+ # base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
2437
 
2438
 
2439
 
 
2443
  inputs=[
2444
  origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
2445
  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
2446
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state
2447
  ],
2448
  outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
2449
  show_progress=False, queue=True
 
2454
  submit_caption,
2455
  inputs=[
2456
  naritive, state,length, sentiment, factuality, language,
2457
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph,focus_d,openai_api_key,new_crop_save_path,gender,log_state,history_log
2458
  ],
2459
  outputs=[
2460
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
2461
  ],
2462
  show_progress=True,
2463
  queue=True
 
2471
  submit_caption,
2472
  inputs=[
2473
  naritive,state,length, sentiment, factuality, language,
2474
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,auto_play, paragraph,focus_da,openai_api_key,new_crop_save_path,gender,log_state,
2475
+ history_log
2476
  ],
2477
  outputs=[
2478
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
2479
+ ],
2480
  show_progress=True,
2481
  queue=True
2482
  )
 
2487
  inputs=[
2488
  naritive,state,length, sentiment, factuality, language,
2489
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2490
+ auto_play, paragraph,focus_dai,openai_api_key,new_crop_save_path,gender,log_state,history_log
2491
  ],
2492
  outputs=[
2493
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
2494
  ],
2495
  show_progress=True,
2496
  queue=True
 
2502
  inputs=[
2503
  naritive,state,length, sentiment, factuality, language,
2504
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2505
+ auto_play, paragraph,focus_dda,openai_api_key,new_crop_save_path,gender,log_state,history_log
2506
  ],
2507
  outputs=[
2508
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
2509
  ],
2510
  show_progress=True,
2511
  queue=True
 
2545
 
2546
  export_button.click(
2547
  export_chat_log,
2548
+ inputs=[log_state,log_list,naritive],
2549
  outputs=[chat_log_file,log_list],
2550
  queue=True
2551
  )
2552
 
2553
  naritive.change(
2554
+ change_naritive,
2555
+ [task_type, image_input, chatbot, state, click_state, paragraph, origin_image,naritive,language],
2556
+ [image_input, chatbot, state, click_state, paragraph, origin_image,gallery_result],
2557
  queue=False,
2558
  show_progress=False
2559
 
2560
  )
2561
 
2562
+ task_type.change(
2563
+ lambda: ([]),
2564
+ [],
2565
+ [log_state]
2566
+ )
2567
+
2568
  # upvote_btn.click(
2569
  # handle_liked,
2570
  # inputs=[state,like_res],