Spaces: Running
Niki Zhang committed: Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
 from io import BytesIO
 import io
 from math import inf

@@ -346,20 +347,33 @@ def extract_features_siglip(image):
     return image_features

 @spaces.GPU
-def infer(
-   [removed lines not captured in this diff view]


 ###############################################################################
@@ -547,28 +561,14 @@ filtered_language_dict = {
     'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'}
 }

-focus_map = {
-   [removed lines not captured in this diff view]
     "Judge":3
 }

-'''
-prompt_list = [
-    'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
-    'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
-    'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
-    'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
-]

-prompt_list = [
-    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
-    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
-    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
-    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
-]
-'''
 prompt_list = [
     [
     'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
@@ -722,11 +722,11 @@ def init_openai_api_key(api_key=""):
|
|
722 |
global gpt_state
|
723 |
gpt_state=1
|
724 |
# return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
|
725 |
-
return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+
|
726 |
else:
|
727 |
gpt_state=0
|
728 |
# return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
|
729 |
-
return [gr.update(visible=False)]*
|
730 |
|
731 |
def init_wo_openai_api_key():
|
732 |
global gpt_state
|
@@ -734,6 +734,7 @@ def init_wo_openai_api_key():
|
|
734 |
# return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
|
735 |
return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]
|
736 |
|
|
|
737 |
def get_click_prompt(chat_input, click_state, click_mode):
|
738 |
inputs = json.loads(chat_input)
|
739 |
if click_mode == 'Continuous':
|
@@ -771,35 +772,38 @@ def update_click_state(click_state, caption, click_mode):
|
|
771 |
raise NotImplementedError
|
772 |
|
773 |
async def chat_input_callback(*args):
|
774 |
-
visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input = args
|
775 |
message = chat_input["text"]
|
776 |
prompt="Please help me answer the question with this painting {question} in {language}."
|
777 |
prompt=prompt.format(question=message, language=language)
|
778 |
-
|
779 |
if visual_chatgpt is not None:
|
780 |
-
result=get_gpt_response(api_key, image_input,prompt+message)
|
781 |
-
state = state + [(None, result)]
|
782 |
read_info = re.sub(r'[#[\]!*]','',result)
|
783 |
read_info = emoji.replace_emoji(read_info,replace="")
|
784 |
-
|
785 |
-
|
786 |
-
|
|
|
|
|
|
|
|
|
787 |
if autoplay==False:
|
788 |
-
return state, state, aux_state, None
|
789 |
|
790 |
else:
|
791 |
-
audio = await texttospeech(read_info,language,
|
792 |
-
return state, state, aux_state, audio
|
793 |
else:
|
794 |
response = "Text refiner is not initilzed, please input openai api key."
|
795 |
state = state + [(chat_input, response)]
|
796 |
-
audio = await texttospeech(response,language,
|
797 |
-
return state, state, None, audio
|
798 |
-
|
799 |
|
800 |
|
801 |
-
def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None):
|
802 |
print("narritive", narritive)
|
|
|
803 |
if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
|
804 |
image_input = image_input['background']
|
805 |
|
@@ -810,6 +814,21 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
|
|
810 |
|
811 |
|
812 |
click_state = [[], [], []]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
813 |
image_input = image_resize(image_input, res=1024)
|
814 |
|
815 |
model = build_caption_anything_with_models(
|
@@ -831,18 +850,14 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     image_input.save(new_image_path)
     visual_chatgpt.current_image = new_image_path
     paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
-    # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
-    Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
-    AI_prompt = "Received."
-    visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
-    visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
-    print("memory",visual_chatgpt.agent.memory)
     # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
     parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
     parsed_data = json.loads(parsed_data.replace("'", "\""))
     name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
     gender=gender.lower()
     print("gender",gender)

     if language=="English":

@@ -888,13 +903,21 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
                 None,
                 f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
             )
-        ]

     return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
-            original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path]
@@ -933,20 +956,23 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
     # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
-    # state = state + [("Selected image point: {}, Input label: {}".format(
-    #     prompt["input_point"],
-    #     '+' if prompt["input_label"] == "1" else '-'
-    # ), None)]
-   [removed lines not captured in this diff view]
     input_mask = np.array(out['mask'].convert('P'))
     image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)

@@ -957,62 +983,75 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     out_state = out

     if visual_chatgpt is not None:
-        print('inference_click: add caption to chatGPT memory')
         new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
         Image.open(out["crop_save_path"]).save(new_crop_save_path)
-        visual_chatgpt.point_prompt = point_prompt
-        print("new crop save",new_crop_save_path)

     yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground

-   [removed lines not captured in this diff view]

 async def submit_caption(naritive, state,length, sentiment, factuality, language,
                          out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                         autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path, gender):

     click_index = click_index_state

-    # if pre_click_index==click_index:
-    #     click_index = (click_index[0] - 1, click_index[1] - 1)
-    #     pre_click_index = click_index
-    # else:
-    #     pre_click_index = click_index
     print("click_index",click_index)
     print("input_points_state",input_points_state)
     print("input_labels_state",input_labels_state)

     prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)

     print("Prompt:", prompt)
     print("click",click_index)

     # if not args.disable_gpt and text_refiner:
     if not args.disable_gpt:
         print("new crop save",new_crop_save_path)
-        focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt)
         if focus_info.startswith('"') and focus_info.endswith('"'):
             focus_info=focus_info[1:-1]
         focus_info=focus_info.replace('#', '')
         # state = state + [(None, f"Wiki: {paragraph}")]
-       [removed lines not captured in this diff view]
         print("new_cap",focus_info)
         read_info = re.sub(r'[#[\]!*]','',focus_info)
         read_info = emoji.replace_emoji(read_info,replace="")
@@ -1028,27 +1067,25 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
             print("error gpt responese")
         print("item gender",gender)

-        # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
-        #                                            input_points=input_points, input_labels=input_labels)
         try:
             if autoplay==False:
-                return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None

-            audio_output = await texttospeech(read_info, language,
             print("done")
             # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output

         except Exception as e:
             state = state + [(None, f"Error during TTS prediction: {str(e)}")]
             print(f"Error during TTS prediction: {str(e)}")
             # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
-            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output

     else:
         state = state + [(None, f"Error during TTS prediction: {str(e)}")]
         print(f"Error during TTS prediction: {str(e)}")
-        return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None
@@ -1088,70 +1125,57 @@ def encode_image(image_path):
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')

-def get_gpt_response(api_key, image_path, prompt,
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {api_key}"
-    }

     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {api_key}"
     }
-    base64_images=[]
     if image_path:
-       [removed lines not captured in this diff view]
-        base64_images.append(base64_image)
-    else:
-        base64_image = encode_image(image_path)
-        base64_images.append(base64_image)
-
-    payload = {
-        "model": "gpt-4o",
-        "messages": [
             {
-               [removed lines not captured in this diff view]
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{base64_images}"
-                    }
-                }
-            ]
-        }
-        ],
-        "max_tokens": 300
-    }
-    else:
-        payload = {
-            "model": "gpt-4o",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": prompt
                         }
-                       [removed lines not captured in this diff view]

     # Sending the request to the OpenAI API
     response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
     result = response.json()
-
     try:
         content = result['choices'][0]['message']['content']
         return content
@@ -1295,61 +1319,62 @@ def clear_chat_memory(visual_chatgpt, keep_global=False):
         visual_chatgpt.global_prompt = ""

-def export_chat_log(chat_state,
     try:
         if not chat_state:
             return None
-        chat_log = f"Image Description: {paragraph}\n\n"
         for entry in chat_state:
             user_message, bot_response = entry
             if user_message and bot_response:
                 chat_log += f"User: {user_message}\nBot: {bot_response}\n"
             elif user_message:
                 chat_log += f"User: {user_message}\n"
             elif bot_response:
                 chat_log += f"Bot: {bot_response}\n"

-       [removed lines not captured in this diff view]
-        chat_log += f"{response}\n"

-        print(
-        temp_file.write(chat_log.encode('utf-8'))
-        temp_file_path = temp_file.name
-        print(temp_file_path)
-        log_list.append(temp_file_path)
         return log_list,log_list
     except Exception as e:
         print(f"An error occurred while exporting the chat log: {e}")
-        return None

-async def get_artistinfo(artist_name,api_key,state,language,autoplay,length):
     prompt = f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received. Start your response with 'Artist Background: '."
     res=get_gpt_response(api_key,None,prompt)
     state = state + [(None, res)]
     read_info = re.sub(r'[#[\]!*]','',res)
     read_info = emoji.replace_emoji(read_info,replace="")

     # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
     #                                            input_points=input_points, input_labels=input_labels)
     if autoplay:
-        audio_output = await texttospeech(read_info, language
         # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-        return state, state,audio_output
-    return state, state,None

-async def get_yearinfo(year,api_key,state,language,autoplay,length):
     prompt = f"Provide a concise summary of about {length} words in {language} on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history with 'History Background: '."
     res=get_gpt_response(api_key,None,prompt)
     state = state + [(None, res)]
     read_info = re.sub(r'[#[\]!*]','',res)
     read_info = emoji.replace_emoji(read_info,replace="")
@@ -1358,47 +1383,47 @@ async def get_yearinfo(year,api_key,state,language,autoplay,length):
     # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
     #                                            input_points=input_points, input_labels=input_labels)
     if autoplay:
-        audio_output = await texttospeech(read_info, language
         # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-        return state, state,audio_output
-    return state, state,None

-async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
-   [removed lines not captured in this diff view]

-def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
-   [removed lines not captured in this diff view]
@@ -1490,62 +1515,119 @@ def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragr
     # return like_state, dislike_state

-async def texttospeech(text, language,
     try:
-       [removed lines not captured in this diff view]
-        else:
-            audio_player = None
-            print("Autoplay is disabled.")
         return audio_player
     except Exception as e:
         print(f"Error in texttospeech: {e}")
         return None

-   [removed line not captured in this diff view]
     rec_path=evt._data['value']['image']['path']
     print("rec_path",rec_path)
     prompt="""
    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects in the second painting that may be related to the selected object and list one fact of selected object, one fact of related object in the second painting and one analysis between two objects as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
     """
     prompt=prompt.format(Wiki_caption=focus_info,language=language,length=length)
     result=get_gpt_response(openai_api_key, rec_path, prompt)
     read_info = re.sub(r'[#[\]!*]','',result)
     read_info = emoji.replace_emoji(read_info,replace="")
     print("associate",read_info)
     if autoplay:
-        audio_output = await texttospeech(read_info, language
-        return
-    return

-def print_like_dislike(x: gr.LikeData,
     print(x.index, x.value, x.liked)
     if x.liked == True:
         print("liked")
-        print(like_res)
         state = state + [(None, f"Liked Received 👍")]
     else:
         state = state + [(None, f"Disliked Received 👎")]

 def toggle_icons_and_update_prompt(point_prompt):
     new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
@@ -1568,13 +1650,13 @@ def create_ui():
     description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

     examples = [
-        ["test_images/
-        ["test_images/
-        ["test_images/
-        ["test_images/test3.jpg"],
-        ["test_images/test4.jpg"],
-        ["test_images/test5.jpg"],
-        ["test_images/Picture5.png"],
     ]

@@ -1582,7 +1664,13 @@ def create_ui():
         css=css,
         theme=gr.themes.Base()
     ) as iface:
         state = gr.State([])
         out_state = gr.State(None)
         click_state = gr.State([[], [], []])
         origin_image = gr.State(None)
@@ -1597,49 +1685,32 @@ def create_ui():
         input_mask_state = gr.State(np.zeros((1, 1)))
         input_points_state = gr.State([])
         input_labels_state = gr.State([])
         new_crop_save_path = gr.State(None)
         image_input_nobackground = gr.State(None)
         artist=gr.State(None)
-        like_res=gr.State([])
-        dislike_res=gr.State([])
         gr.Markdown(title)
         gr.Markdown(description)
         point_prompt = gr.State("Positive")
         log_list=gr.State([])
         gender=gr.State('female')
         image_path=gr.State('')
-       [removed lines not captured in this diff view]
-        # with gr.Row(align="right",visible=False) as autoplay:
-        #     auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
-        #     output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
-        # with gr.Row(align="right",visible=False) as language_select:
-        #     language = gr.Dropdown(
-        #         ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
-        #         value="English", label="Language", interactive=True)
-        # with gr.Row(align="right",visible=False) as autoplay:
-        #     auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
-        #     output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
         with gr.Row():
             with gr.Column(scale=6):
                 with gr.Column(visible=False) as modules_not_need_gpt:
                     image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                     with gr.Row():
                         name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
@@ -1647,7 +1718,7 @@ def create_ui():
                         year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact")
                         material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")

-                with gr.Tab("Base2") as base_tab2:
                     image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                     with gr.Row():
                         name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")

@@ -1665,6 +1736,12 @@ def create_ui():
                         artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
                         year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
                         material_label = gr.Button(value="Style: ",elem_classes="info_btn")

                     # example_image_click = gr.Image(type="pil", interactive=False, visible=False)

@@ -1673,10 +1750,9 @@ def create_ui():
                         add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
                         minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
                         clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
-                       [removed lines not captured in this diff view]
-                        focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button",variant="primary")
                         focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")

                         recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
@@ -1743,28 +1819,13 @@ def create_ui():
                             value="No",
                             label="Expert",
                             interactive=True)

-                with gr.Column(visible=False) as recommend:
-                    gallery_result = gr.Gallery(
-                        label="Result",
-                        height="auto",
-                        columns=4
-                        # columns=4,
-                        # rows=2,
-                        # show_label=False,
-                        # allow_preview=True,
-                        # object_fit="contain",
-                        # height="auto",
-                        # preview=True,
-                        # show_share_button=True,
-                        # show_download_button=True
-                    )

                 with gr.Column(visible=True) as modules_not_need_gpt3:
                     gr.Examples(
                         examples=examples,
                         inputs=[example_image],
                     )
@@ -1780,20 +1841,29 @@ def create_ui():
                         type="password")
                     with gr.Row():
                         enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
-                        disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,

                 with gr.Column(visible=False) as module_notification_box:
                     notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)

-                with gr.Column() as modules_need_gpt0:
-                   [removed lines not captured in this diff view]

                 with gr.Blocks():
                     chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
-                    with gr.Column(
                         chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
                         with gr.Row():
                             clear_button_text = gr.Button(value="Clear Chat", interactive=True)

@@ -1801,13 +1871,9 @@ def create_ui():
                             # submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
                             # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
                             # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
-                           [removed lines not captured in this diff view]
-                                value="Third-person",
-                                label="narritive",
-                                scale=5,
-                                interactive=True)

                 # TTS interface hidden initially
@@ -1824,10 +1890,46 @@ def create_ui():
                     with gr.Row():
                         submit_tts = gr.Button(value="Submit", interactive=True)
                         clear_tts = gr.Button(value="Clear", interactive=True)

     ###############################################################################
     ############# this part is for text to image #############
     ###############################################################################

     with gr.Row(variant="panel",visible=False) as text2image_model:

         with gr.Column():

@@ -1922,10 +2024,14 @@ def create_ui():
             #     # show_download_button=True
             # )

-            with gr.Row():
                 chat_log_file = gr.File(label="Download Chat Log",scale=5)
             language = gr.Dropdown(
                 ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
                 value="English", label="Language", interactive=True, elem_classes="custom-language"

@@ -1938,12 +2044,12 @@ def create_ui():
                 interactive=True,
                 label="Generated Caption Length",
             )
-            auto_play = gr.Checkbox(
-                label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
-            )
@@ -1984,14 +2090,14 @@ def create_ui():
     # )
     recommend_btn.click(
         fn=infer,
-        inputs=[new_crop_save_path],
         outputs=[gallery_result]
     )

     gallery_result.select(
         associate,
-        inputs=[paragraph,openai_api_key,language,
-        outputs=[
     )

@@ -2093,7 +2199,7 @@ def create_ui():
     # mv_images = gr.State()

     # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
     #     fn=generate_mvs,
@@ -2138,81 +2244,80 @@ def create_ui():
    #     queue=False,
    #     show_progress=False
    # )

    openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
-                          outputs=[
-                                   modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend])
    enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
-                                outputs=[
                                          modules_not_need_gpt,
-                                         modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend])
-   [removed / commented-out lines not captured in this diff view]
-                                          modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])

-    artist_label_base2.click(
-        get_artistinfo,
-        inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length],
-        outputs=[chatbot,state,output_audio]
-    )
     artist_label.click(
         get_artistinfo,
-        inputs=[artist_label,openai_api_key,state,language,auto_play,length],
-        outputs=[chatbot,state,output_audio]
-    )
-    artist_label_traj.click(
-        get_artistinfo,
-        inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length],
-        outputs=[chatbot,state,output_audio]
     )

-    year_label_base2.click(
-       [removed lines not captured in this diff view]
-    )
     year_label.click(
         get_yearinfo,
-        inputs=[year_label,openai_api_key,state,language,auto_play,length],
-        outputs=[chatbot,state,output_audio]
-    )
-    year_label_traj.click(
-        get_yearinfo,
-        inputs=[year_label_traj,openai_api_key,state,language,auto_play,length],
-        outputs=[chatbot,state,output_audio]
     )

-    enable_chatGPT_button.click(
-       [removed lines not captured in this diff view]
-    )
-    openai_api_key.submit(
-       [removed lines not captured in this diff view]
-    )

-    cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],

     clear_button_click.click(
         lambda x: ([[], [], []], x),
@@ -2222,53 +2327,61 @@ def create_ui():
        queue=False,
        show_progress=False
    )
    clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
-    clear_button_image.click(
-       [removed lines not captured in this diff view]
-    )
-    clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
    clear_button_text.click(
-        lambda: ([], [], [[], [], [], []]),
        [],
-        [chatbot, state, click_state],
        queue=False,
        show_progress=False
    )
    clear_button_text.click(clear_chat_memory, inputs=[visual_chatgpt])

    image_input.clear(
-        lambda: (None, [], [], [[], [], []], "",
        [],
-        [image_input, chatbot, state, click_state,
        queue=False,
        show_progress=False
    )

    image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])

-    image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
-       [removed lines not captured in this diff view]

-    image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
-       [removed lines not captured in this diff view]

-    image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key,language,naritive],
                        [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                        image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                        name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-                        paragraph,artist,gender,image_path])

    # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
    #                 [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,

@@ -2282,45 +2395,45 @@ def create_ui():
    # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
    #                 [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
    #                 image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
-    chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path],
-                      [chatbot, state, aux_state,output_audio])
    # chat_input.submit(lambda: "", None, chat_input)
    chat_input.submit(lambda: {"text": ""}, None, chat_input)
    # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
    #                 [chatbot, state, aux_state,output_audio])
    # submit_button_text.click(lambda: "", None, chat_input)
-    example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key,language,naritive],
                        [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                        image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                        name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-                          paragraph,artist,gender,image_path])

    example_image.change(clear_chat_memory, inputs=[visual_chatgpt])

-    def on_click_tab_selected():
-       [removed lines not captured in this diff view]
-    def on_base_selected():
-       [removed lines not captured in this diff view]
-    traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
-    click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
-    base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
-    base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
@@ -2330,7 +2443,7 @@ def create_ui():
        inputs=[
            origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
            image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
-            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state
        ],
        outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
        show_progress=False, queue=True

@@ -2341,10 +2454,10 @@ def create_ui():
        submit_caption,
        inputs=[
            naritive, state,length, sentiment, factuality, language,
-            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph,focus_d,openai_api_key,new_crop_save_path,gender
        ],
        outputs=[
-            chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
        ],
        show_progress=True,
        queue=True

@@ -2358,11 +2471,12 @@ def create_ui():
        submit_caption,
        inputs=[
            naritive,state,length, sentiment, factuality, language,
-            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,auto_play, paragraph,focus_da,openai_api_key,new_crop_save_path
        ],
        outputs=[
-           [removed lines not captured in this diff view]
        show_progress=True,
        queue=True
    )

@@ -2373,10 +2487,10 @@ def create_ui():
        inputs=[
            naritive,state,length, sentiment, factuality, language,
            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-            auto_play, paragraph,focus_dai,openai_api_key,new_crop_save_path
        ],
        outputs=[
-            chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
        ],
        show_progress=True,
        queue=True

@@ -2388,10 +2502,10 @@ def create_ui():
        inputs=[
            naritive,state,length, sentiment, factuality, language,
            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-            auto_play, paragraph,focus_dda,openai_api_key,new_crop_save_path
        ],
        outputs=[
-            chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
        ],
        show_progress=True,
        queue=True

@@ -2431,20 +2545,26 @@ def create_ui():
    export_button.click(
        export_chat_log,
-        inputs=[
        outputs=[chat_log_file,log_list],
        queue=True
    )

    naritive.change(
-       [removed lines not captured in this diff view]
-        [],
-        [image_input, chatbot, state, click_state,
        queue=False,
        show_progress=False
    )

    # upvote_btn.click(
    #     handle_liked,
    #     inputs=[state,like_res],
@@ -1,3 +1,4 @@
+import datetime
 from io import BytesIO
 import io
 from math import inf

@@ -346,20 +347,33 @@ def extract_features_siglip(image):
     return image_features

 @spaces.GPU
+def infer(crop_image_path,full_image_path):
+    input_image = Image.open(crop_image_path).convert("RGB")
+    input_features = extract_features_siglip(input_image.convert("RGB"))
+    input_features = input_features.detach().cpu().numpy()
+    input_features = np.float32(input_features)
+    faiss.normalize_L2(input_features)
+    distances, indices = index.search(input_features, 2)
+    gallery_output = []
+    for i,v in enumerate(indices[0]):
+        sim = -distances[0][i]
+        image_url = df.iloc[v]["Link"]
+        img_retrieved = read_image_from_url(image_url)
+        gallery_output.append(img_retrieved)
+
+    input_image = Image.open(full_image_path).convert("RGB")
+    input_features = extract_features_siglip(input_image.convert("RGB"))
+    input_features = input_features.detach().cpu().numpy()
+    input_features = np.float32(input_features)
+    faiss.normalize_L2(input_features)
+    distances, indices = index.search(input_features, 2)
+    for i,v in enumerate(indices[0]):
+        sim = -distances[0][i]
+        image_url = df.iloc[v]["Link"]
+        img_retrieved = read_image_from_url(image_url)
+        gallery_output.append(img_retrieved)
+
+    return gallery_output


 ###############################################################################
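The new `infer` relies on a prebuilt FAISS `index`, a DataFrame `df` with a "Link" column, and the helpers `extract_features_siglip` / `read_image_from_url` defined elsewhere in app.py; none of that setup appears in this hunk. A minimal sketch of how such an index could be built offline, assuming cosine similarity over L2-normalized SigLIP features (names and paths here are illustrative, not from this commit):

```python
# Hypothetical offline index build; column names and call sites are assumptions.
import faiss
import numpy as np
import pandas as pd

def build_retrieval_index(feature_rows, links, dim):
    # feature_rows: list of 1 x dim arrays produced by extract_features_siglip
    feats = np.vstack(feature_rows).astype(np.float32)
    faiss.normalize_L2(feats)            # normalize so inner product == cosine similarity
    idx = faiss.IndexFlatIP(dim)
    idx.add(feats)
    df = pd.DataFrame({"Link": links})   # row order must match the FAISS ids
    return idx, df

# Usage mirrors infer(): distances, indices = idx.search(query_feats, 2)
# then df.iloc[v]["Link"] gives the URL of the v-th retrieved painting.
```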
|
|
@@ -547,28 +561,14 @@ filtered_language_dict = {
     'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'}
 }

+focus_map = {
+    "Describe":0,
+    "D+Analysis":1,
+    "DA+Interprete":2,
     "Judge":3
 }

 prompt_list = [
     [
     'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
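`focus_map` turns the four focus buttons into indices, and the template actually used is chosen later by `generate_prompt`, whose body is not part of this excerpt. A minimal sketch of that selection pattern, under the assumption that a narrative index picks one of the `prompt_list` sub-lists (the helper name and the narrative handling are illustrative):

```python
# Sketch only; not the real generate_prompt from app.py.
def pick_prompt(focus_type, wiki_caption, length, language, narrative_idx=0):
    focus_idx = focus_map[focus_type]                 # e.g. "Judge" -> 3
    template = prompt_list[narrative_idx][focus_idx]  # one template per focus level
    return template.format(Wiki_caption=wiki_caption, language=language, length=length)
```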
@@ -722,11 +722,11 @@ def init_openai_api_key(api_key=""):
         global gpt_state
         gpt_state=1
         # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
+        return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]* 3 + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3+[gr.update(visible=False)]
     else:
         gpt_state=0
         # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
+        return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*4

@@ -734,6 +734,7 @@ def init_wo_openai_api_key():
     # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
     return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]
+
 def get_click_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
     if click_mode == 'Continuous':
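The rewritten returns matter because Gradio maps each element of the returned list positionally onto the event handler's `outputs=` list, so the element count has to match exactly; the truncated old returns shown in the earlier hunk are the bug being fixed. A small self-contained sketch of the pattern, with illustrative component names rather than the ones in app.py:

```python
import gradio as gr

def toggle_panels(api_key=""):
    ok = bool(api_key)
    # One value per entry in outputs=[...], in the same order.
    return [gr.update(visible=ok), gr.update(visible=not ok), api_key or None]

with gr.Blocks() as demo:
    key_box = gr.Textbox(label="OpenAI API key", type="password")
    chat_box = gr.Textbox(label="Chat", visible=False)
    notice = gr.Textbox(label="Notice", visible=True)
    key_state = gr.State(None)
    key_box.submit(toggle_panels, inputs=[key_box],
                   outputs=[chat_box, notice, key_state])
```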
@@ -771,35 +772,38 @@ def update_click_state(click_state, caption, click_mode):
         raise NotImplementedError

 async def chat_input_callback(*args):
+    visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input,log_state,history = args
     message = chat_input["text"]
     prompt="Please help me answer the question with this painting {question} in {language}."
     prompt=prompt.format(question=message, language=language)
+
     if visual_chatgpt is not None:
+        result=get_gpt_response(api_key, image_input,prompt+message,history)
         read_info = re.sub(r'[#[\]!*]','',result)
         read_info = emoji.replace_emoji(read_info,replace="")
+        state = state + [(message,result)]
+        log_state += [(message,result)]
+        # log_state += [("%% chat messahe %%",None)]
+
+        history.append({"role": "user", "content": message})
+        history.append({"role": "assistant", "content": result})
+
         if autoplay==False:
+            return state, state, aux_state, None,log_state,history

         else:
+            audio = await texttospeech(read_info,language,gender)
+            return state, state, aux_state, audio,log_state,history
     else:
         response = "Text refiner is not initilzed, please input openai api key."
         state = state + [(chat_input, response)]
+        audio = await texttospeech(response,language,gender)
+        return state, state, None, audio,log_state,history
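`chat_input_callback` now awaits `texttospeech(read_info, language, gender)`, whose implementation is outside this excerpt. Since `filtered_language_dict` holds Azure neural voice names such as 'zh-HK-HiuGaaiNeural', an edge-tts based helper is one plausible shape for it; the sketch below is an assumption, not the code from this commit:

```python
# Hedged sketch of a texttospeech helper built on edge-tts; the actual function
# in app.py is not shown in this diff excerpt.
import base64
import edge_tts

async def texttospeech_sketch(text, language, gender="female"):
    voice = filtered_language_dict[language][gender]   # e.g. 'zh-HK-HiuGaaiNeural'
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save("output.mp3")
    with open("output.mp3", "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode()
    # Return an HTML audio player, matching the gr.HTML output_audio component.
    return f'<audio controls autoplay src="data:audio/mpeg;base64,{audio_b64}"></audio>'
```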
+def upload_callback(image_input, state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None):
     print("narritive", narritive)
+
     if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
         image_input = image_input['background']

@@ -810,6 +814,21 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     click_state = [[], [], []]
+
+    # width, height = image_input.size
+
+    # target_width=500
+    # target_height=650
+
+    # width_ratio = target_width / width
+    # height_ratio = target_height / height
+    # ratio = min(width_ratio, height_ratio)
+
+    # if ratio < 1.0:
+    #     new_size = (int(width * ratio), int(height * ratio))
+    #     image_input = image_input.resize(new_size, Image.ANTIALIAS)
+
     image_input = image_resize(image_input, res=1024)

     model = build_caption_anything_with_models(

@@ -831,18 +850,14 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     image_input.save(new_image_path)
     visual_chatgpt.current_image = new_image_path
     paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
     # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
     parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
+    print(parsed_data)
     parsed_data = json.loads(parsed_data.replace("'", "\""))
     name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
     gender=gender.lower()
     print("gender",gender)
+

     if language=="English":

@@ -888,13 +903,21 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
                 None,
                 f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
             )
+        ]
+
+    log_state += [(name,None)]
+    log_state=log_state+[(paragraph,None)]
+    log_state=log_state+[(narritive,None)]
+    log_state=log_state+state
+    log_state = log_state + [("%% basic information %%", None)]
+
+    history=[]
+    history.append({"role": "assistant", "content": paragraph+state[0][1]})

     return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
+            original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history]
@@ -933,20 +956,23 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
     # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]

+    print(prompt["input_label"][-1])
+    if language=="English":
+        if prompt["input_label"][-1]==1:
+            msg="You've added an area at {}. ".format(prompt["input_point"][-1])
+        else:
+            msg="You've removed an area at {}. ".format(prompt["input_point"][-1])
+    else:
+        if prompt["input_label"][-1]==1:
+            msg="你添加了在 {} 的区域。 ".format(prompt["input_point"][-1])
+        else:
+            msg="你删除了在 {} 的区域。 ".format(prompt["input_point"][-1])

+    state = state + [(msg, None)]
+
     input_mask = np.array(out['mask'].convert('P'))
     image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)

@@ -957,62 +983,75 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     out_state = out

     if visual_chatgpt is not None:
         new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
         Image.open(out["crop_save_path"]).save(new_crop_save_path)
+        print("new crop save",new_crop_save_path)

     yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground


+query_focus_en = [
+    "Provide a description of the item.",
+    "Provide a description and analysis of the item.",
+    "Provide a description, analysis, and interpretation of the item.",
+    "Evaluate the item."
+]
+
+query_focus_zh = [
+    "请描述一下这个物体。",
+    "请描述和分析一下这个物体。",
+    "请描述、分析和解释一下这个物体。",
+    "请以艺术鉴赏的角度评价一下这个物体。"
+]
async def submit_caption(naritive, state,length, sentiment, factuality, language,
|
1009 |
out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
|
1010 |
+
autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path, gender,log_state,history):
|
1011 |
|
1012 |
|
1013 |
+
focus_value=focus_map[focus_type]
|
|
|
1014 |
click_index = click_index_state
|
1015 |
|
|
|
|
|
|
|
|
|
|
|
1016 |
print("click_index",click_index)
|
1017 |
print("input_points_state",input_points_state)
|
1018 |
print("input_labels_state",input_labels_state)
|
1019 |
|
1020 |
prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
|
1021 |
|
1022 |
+
log_state = log_state + [("Selected image point: {}, Input label: {}".format(input_points_state, input_labels_state), None)]
|
1023 |
+
|
1024 |
+
|
1025 |
print("Prompt:", prompt)
|
1026 |
print("click",click_index)
|
1027 |
|
1028 |
+
log_state = log_state + [(naritive, None)]
|
1029 |
+
|
|
|
1030 |
|
1031 |
# if not args.disable_gpt and text_refiner:
|
1032 |
if not args.disable_gpt:
|
1033 |
print("new crop save",new_crop_save_path)
|
1034 |
+
focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt,history)
|
1035 |
if focus_info.startswith('"') and focus_info.endswith('"'):
|
1036 |
focus_info=focus_info[1:-1]
|
1037 |
focus_info=focus_info.replace('#', '')
|
1038 |
# state = state + [(None, f"Wiki: {paragraph}")]
|
1039 |
+
if language=="English":
|
1040 |
+
user_query=query_focus_en[focus_value]
|
1041 |
+
|
1042 |
+
else:
|
1043 |
+
user_query=query_focus_zh[focus_value]
|
1044 |
+
|
1045 |
+
state = state + [(user_query, f"{focus_info}")]
|
1046 |
+
log_state = log_state + [(user_query, None)]
|
1047 |
+
log_state = log_state + [(None, f"{focus_info}")]
|
1048 |
+
|
1049 |
+
# save history
|
1050 |
+
history.append({"role": "user", "content": user_query})
|
1051 |
+
history.append({"role": "assistant", "content": focus_info})
|
1052 |
+
|
1053 |
+
|
1054 |
+
|
1055 |
print("new_cap",focus_info)
|
1056 |
read_info = re.sub(r'[#[\]!*]','',focus_info)
|
1057 |
read_info = emoji.replace_emoji(read_info,replace="")
|

        print("error gpt response")
        print("item gender",gender)

        try:
            if autoplay==False:
                return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,log_state,history

            audio_output = await texttospeech(read_info, language,gender)
            print("done")
            # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output,log_state,history

        except Exception as e:
            state = state + [(None, f"Error during TTS prediction: {str(e)}")]
            print(f"Error during TTS prediction: {str(e)}")
            # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
            # return None for the audio slot: audio_output may never have been assigned if the TTS call raised
            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,log_state,history

    else:
        # GPT is disabled: there is no response (and no exception object `e`) to report here,
        # so log a plain message and keep the same 10-item return shape as the other branches
        state = state + [(None, "GPT is disabled, no focus caption was generated.")]
        print("GPT is disabled, no focus caption was generated.")
        return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,log_state,history


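# Note (sketch): every return path of submit_caption above is assumed to yield the same 10-tuple
# so it can be bound to the Gradio outputs wired further down in this file, roughly
#
#   (chatbot, state, click_index_state, input_mask_state, input_points_state,
#    input_labels_state, out_state, output_audio, log_state, history_log)
#
# with the audio slot left as None whenever autoplay is off or TTS fails.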
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def get_gpt_response(api_key, image_path, prompt, history=None):

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    if history:
        if len(history) > 4:
            history = history[-4:]
    else:
        history = []

    messages = history[:]

    if image_path:
        base64_image = encode_image(image_path)
        messages.append({
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
            ]
        })
    else:
        # text-only turn: the content parts must be wrapped in a list for the chat/completions API
        messages.append({"role": "user",
                         "content": [
                             {
                                 "type": "text",
                                 "text": prompt
                             }
                         ]})

    payload = {
        "model": "gpt-4o",
        "messages": messages,
        "max_tokens": 600
    }

    # Sending the request to the OpenAI API
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    result = response.json()
    print("gpt result",result)
    try:
        content = result['choices'][0]['message']['content']
        return content

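# Example (sketch; the key and paths below are placeholders, not values from this repo):
#
#   caption = get_gpt_response(
#       api_key="sk-...",                                  # hypothetical OpenAI key
#       image_path="test_images/1.The Ambassadors.jpg",    # cropped selection; None for text-only
#       prompt="Describe the selected object.",
#       history=[],                                        # optional prior chat turns (last 4 kept)
#   )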
        visual_chatgpt.global_prompt = ""


def export_chat_log(chat_state,log_list,narrative):
    try:
        chat_log=""
        if not chat_state:
            return None
        for entry in chat_state:
            user_message, bot_response = entry
            if user_message and bot_response:
                chat_log += f"User: {user_message}\nBot: {bot_response}\n"
            elif user_message and user_message.startswith("%%"):
                chat_log += f"{user_message}\n"
            elif user_message:
                chat_log += f"User: {user_message}\n"
                chat_log += f"///// \n"
            elif bot_response:
                chat_log += f"Bot: {bot_response}\n"
                chat_log += f"///// \n"

        print("export log...")
        current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        file_name = f"{current_time}_{narrative}.txt"
        file_path = os.path.join(os.getcwd(), file_name)  # save to the current working directory

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(chat_log)

        print(file_path)

        log_list.append(file_path)
        return log_list,log_list
    except Exception as e:
        print(f"An error occurred while exporting the chat log: {e}")
        return None,None

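# Example (sketch, illustrative values only): with a chat_state like
#
#   [("Who painted this?", "Hans Holbein the Younger."), ("%% user interaction %%", None)]
#
# export_chat_log would write a file named e.g. 20240101_120000_Third-person.txt containing
#
#   User: Who painted this?
#   Bot: Hans Holbein the Younger.
#   %% user interaction %%
#
# and return the accumulated list of exported file paths twice (for chat_log_file and log_list).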
async def get_artistinfo(artist_name,api_key,state,language,autoplay,length,log_state):
    prompt = f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received. Start your response with 'Artist Background: '."
    res=get_gpt_response(api_key,None,prompt)
    state = state + [(None, res)]
    read_info = re.sub(r'[#[\]!*]','',res)
    read_info = emoji.replace_emoji(read_info,replace="")
    # log the actual response text (the original logged the literal string "res")
    log_state=log_state+[(None, res)]


    # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
    #                                           input_points=input_points, input_labels=input_labels)
    if autoplay:
        audio_output = await texttospeech(read_info, language)
        # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
        return state, state,audio_output,log_state
    return state, state,None,log_state

async def get_yearinfo(year,api_key,state,language,autoplay,length,log_state):
    prompt = f"Provide a concise summary of about {length} words in {language} on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history with 'History Background: '."
    res=get_gpt_response(api_key,None,prompt)
    # log the actual response text (the original logged the literal string "res")
    log_state=log_state+[(None, res)]
    state = state + [(None, res)]
    read_info = re.sub(r'[#[\]!*]','',res)
    read_info = emoji.replace_emoji(read_info,replace="")

    # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
    #                                           input_points=input_points, input_labels=input_labels)
    if autoplay:
        audio_output = await texttospeech(read_info, language)
        # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
        return state, state,audio_output,log_state
    return state, state,None,log_state

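# Note (sketch): both helpers above call texttospeech(read_info, language) without a gender
# argument, so the 'female' default voice defined in texttospeech further down applies; their
# 4-tuple returns (chatbot, state, audio, log) match the artist_label / year_label click wiring
# near the end of this file.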



# async def cap_everything(paragraph, visual_chatgpt,language,autoplay):

#     # state = state + [(None, f"Caption Everything: {paragraph}")]
#     Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
#     AI_prompt = "Received."
#     visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt

#     # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
#     visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
#     # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
#     audio_output=await texttospeech(paragraph,language,autoplay)
#     return paragraph,audio_output

# def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):

#     model = build_caption_anything_with_models(
#         args,
#         api_key="",
#         captioner=shared_captioner,
#         sam_model=shared_sam_model,
#         ocr_reader=shared_ocr_reader,
#         text_refiner=text_refiner,
#         session_id=iface.app_id
#     )
#     paragraph = model.inference_cap_everything(image_input, verbose=True)
#     # state = state + [(None, f"Caption Everything: {paragraph}")]
#     Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
#     AI_prompt = "Received."
#     visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
#     visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
#     # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
#     return paragraph


    # return like_state, dislike_state

async def texttospeech(text, language,gender='female'):
    try:

        voice = filtered_language_dict[language][gender]
        communicate = edge_tts.Communicate(text=text, voice=voice,rate="+25%")
        file_path = "output.wav"
        await communicate.save(file_path)
        with open(file_path, "rb") as audio_file:
            audio_bytes = BytesIO(audio_file.read())
        audio = base64.b64encode(audio_bytes.read()).decode("utf-8")
        print("TTS processing completed.")
        audio_style = 'style="width:210px;"'
        audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
        return audio_player

    except Exception as e:
        print(f"Error in texttospeech: {e}")
        return None

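# Example (sketch; the strings below are placeholders): texttospeech is awaited by the handlers
# above and its return value (an HTML <audio> tag with the clip base64-embedded, or None on
# failure) is sent straight to the output_audio gr.HTML component, e.g.
#
#   audio_html = await texttospeech("Welcome to EyeSee.", "English")          # default female voice
#   audio_html = await texttospeech("欢迎使用。", "Chinese", gender="male")    # explicit male voice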
# give the reason for the recommendation
async def associate(focus_info,openai_api_key,language,autoplay,length,log_state,sort_score,evt: gr.SelectData,narritive=None):
    rec_path=evt._data['value']['image']['path']
    index=evt.index
    print("rec_path",rec_path)
    prompt="""
    'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects in the second painting that may be related to the selected object and list one fact of selected object, one fact of related object in the second painting and one analysis between two objects as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
    """
    prompt=prompt.format(Wiki_caption=focus_info,language=language,length=length)
    result=get_gpt_response(openai_api_key, rec_path, prompt)
    print("recommend result",result)
    reason = [(None, f"{result}")]
    log_state = log_state + [(narritive, None)]
    log_state = log_state + [(f"image sort ranking {sort_score}", None)]
    log_state = log_state + [(None, f"{result}")]
    read_info = re.sub(r'[#[\]!*]','',result)
    read_info = emoji.replace_emoji(read_info,replace="")
    print("associate",read_info)
    if autoplay:
        audio_output = await texttospeech(read_info, language)
        return reason,audio_output,log_state,index
    return reason,None,log_state,index

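# Note (sketch): associate is bound to gallery_result.select below, so Gradio passes the
# gr.SelectData event automatically; evt.index is the clicked gallery position, and
# evt._data['value']['image']['path'] (a private Gradio field this code relies on) is the local
# path of the recommended image, which is then sent back to GPT together with the selected
# object's caption.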

def change_naritive(task_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
    if task_type=="Session 1":
        return None, [], [], [[], [], []], "", None, []
    else:
        if language=="English":
            if narritive=="Third-person" :
                state += [
                    (
                        None,
                        f"🤖 Hi, I am EyeSee. Let's explore this painting together."
                    )
                ]
            elif narritive=="Single-Persona: Artist":
                state += [
                    (
                        None,
                        f"🧑‍🎨 Let's delve into it from the perspective of the artist."
                    )
                ]
            elif narritive=="Multi-Persona: Objects":
                state += [
                    (
                        None,
                        f"🎨 Let's delve into it from the perspective of the objects depicted in the scene."
                    )
                ]
        elif language=="Chinese":
            if narritive=="Third-person" :
                state += [
                    (
                        None,
                        "🤖 让我们从第三方视角一起探索这幅画吧。"
                    )
                ]
            elif narritive == "Single-Persona: Artist":
                state += [
                    (
                        None,
                        "🧑‍🎨 让我们从艺术家的视角深入探索这幅画。"
                    )
                ]
            elif narritive == "Multi-Persona: Objects":
                state += [
                    (
                        None,
                        "🎨 让我们从画面中事物的视角深入探索这幅画。"
                    )
                ]

        # return a 7th value (leave the recommendation gallery untouched) so the tuple matches
        # the seven outputs wired to naritive.change further down
        return image_input, state, state, click_state, paragraph, origin_image, gr.update()

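# Example (sketch; `img` is a placeholder) of the two return shapes above: switching the persona
# while task_type is "Session 1" resets everything,
#
#   change_naritive("Session 1", img, [], [], [[], [], []], "", img, "Third-person")
#   # -> (None, [], [], [[], [], []], "", None, [])
#
# whereas in "Session 2" the current image/paragraph are kept and only a greeting bubble for the
# newly selected persona is appended to the chat state.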

def print_like_dislike(x: gr.LikeData,state,log_state):
    print(x.index, x.value, x.liked)
    if x.liked == True:
        print("liked")
        log_state=log_state+[(f"User liked this message", None)]
        state = state + [(None, f"Liked Received 👍")]
    else:
        log_state=log_state+[(f"User disliked this message", None)]
        state = state + [(None, f"Disliked Received 👎")]
    log_state+=[("%% user interaction %%", None)]
    return log_state,state

def get_recommendationscore(index,score,log_state):
    log_state+=[(f"Picture {index} : {score}",None)]
    log_state+=[("%% recommendation %%",None)]
    return log_state

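# Example (sketch): after the user rates the 2nd recommended picture with a 4,
# get_recommendationscore(2, 4, log_state) appends
#
#   ("Picture 2 : 4", None)
#   ("%% recommendation %%", None)
#
# to log_state; the "%%"-prefixed markers are the entries that export_chat_log writes out verbatim.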



def toggle_icons_and_update_prompt(point_prompt):
    new_prompt = "Negative" if point_prompt == "Positive" else "Positive"

description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
|
1651 |
|
1652 |
examples = [
|
1653 |
+
["test_images/1.The Ambassadors.jpg"],
|
1654 |
+
["test_images/2.Football Players.jpg"],
|
1655 |
+
["test_images/3.Along the River during the Qingming Festival.jpeg"],
|
1656 |
+
# ["test_images/test3.jpg"],
|
1657 |
+
# ["test_images/test4.jpg"],
|
1658 |
+
# ["test_images/test5.jpg"],
|
1659 |
+
# ["test_images/Picture5.png"],
|
1660 |
|
1661 |
]
|
1662 |
|
|
|
    css=css,
    theme=gr.themes.Base()
) as iface:
    # displayed in the chatbox
    state = gr.State([])
    # exported in the log
    log_state = gr.State([])
    # history log for gpt
    history_log = gr.State([])

    out_state = gr.State(None)
    click_state = gr.State([[], [], []])
    origin_image = gr.State(None)

    input_mask_state = gr.State(np.zeros((1, 1)))
    input_points_state = gr.State([])
    input_labels_state = gr.State([])
    # store the selected (cropped) region of the image
    new_crop_save_path = gr.State(None)
    image_input_nobackground = gr.State(None)
    artist = gr.State(None)

    gr.Markdown(title)
    gr.Markdown(description)
    point_prompt = gr.State("Positive")
    log_list = gr.State([])
    gender = gr.State('female')
    # store the whole image path
    image_path = gr.State('')
    pic_index = gr.State(None)

    with gr.Row():
        auto_play = gr.Checkbox(
            label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
        )
        output_audio = gr.HTML(
            label="Synthesised Audio", elem_classes="custom-output"
        )

    with gr.Row():
        with gr.Column(scale=6):
            with gr.Column(visible=False) as modules_not_need_gpt:

                with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
                    image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                    with gr.Row():
                        name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")

                        year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact")
                        material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")

                with gr.Tab("Base2",visible=False) as base_tab2:
                    image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                    with gr.Row():
                        name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")

                        artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
                        year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
                        material_label = gr.Button(value="Style: ",elem_classes="info_btn")

                with gr.Row():
                    gr.Examples(
                        examples=examples,
                        inputs=[example_image],
                    )

                # example_image_click = gr.Image(type="pil", interactive=False, visible=False)

                    add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
                    minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
                    clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
                    focus_d = gr.Button(value="Describe",interactive=True,elem_classes="function_button",variant="primary")
                    focus_da = gr.Button(value="D+Analysis",interactive=True,elem_classes="function_button",variant="primary")
                    focus_dai = gr.Button(value="DA+Interprete",interactive=True,elem_classes="function_button",variant="primary")
                    focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")

                    recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")

                        value="No",
                        label="Expert",
                        interactive=True)

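                # Note (sketch, an assumption): the four focus buttons above are later passed to
                # submit_caption as its focus_type input, so their value strings ("Describe",
                # "D+Analysis", "DA+Interprete", "Judge") are presumably the keys of focus_map
                # defined near the top of the file.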
            with gr.Column(visible=True) as modules_not_need_gpt3:
                gr.Examples(
                    examples=examples,
                    inputs=[example_image],
                )


                    type="password")
                with gr.Row():
                    enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
                    # disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,
                    #                                    variant='primary')
                with gr.Column(visible=False) as module_notification_box:
                    notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)

                # with gr.Column() as modules_need_gpt0:
                #     with gr.Column(visible=False) as modules_need_gpt2:
                #         paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
                #         cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)


                with gr.Column(visible=False) as modules_not_need_gpt2:
                    with gr.Row():
                        naritive = gr.Radio(
                            choices=["Third-person", "Single-Persona: Artist","Multi-Persona: Objects"],
                            value="Third-person",
                            label="Persona",
                            scale=5,
                            interactive=True)
                    with gr.Blocks():
                        chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
                    with gr.Column() as modules_need_gpt3:
                        chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
                        with gr.Row():
                            clear_button_text = gr.Button(value="Clear Chat", interactive=True)

                            # submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
                            # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
                            # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)



        # TTS interface hidden initially

            with gr.Row():
                submit_tts = gr.Button(value="Submit", interactive=True)
                clear_tts = gr.Button(value="Clear", interactive=True)

    with gr.Row():
        with gr.Column(scale=6):
            with gr.Column(visible=False) as recommend:
                gallery_result = gr.Gallery(
                    label="Recommendations",
                    height="auto",
                    columns=4
                    # columns=4,
                    # rows=2,
                    # show_label=False,
                    # allow_preview=True,
                    # object_fit="contain",
                    # height="auto",
                    # preview=True,
                    # show_share_button=True,
                    # show_download_button=True
                )
                sort_rec = gr.Dropdown(["1", "2", "3", "4"],
                                       value=[],
                                       multiselect=True,
                                       label="Score", info="Please sort the pictures according to your preference"
                                       )

        with gr.Column(scale=4,visible=False) as reco_reasons:
            recommend_bot = gr.Chatbot(label="Recommend Reasons", elem_classes="chatbot",height=600)
            recommend_score = gr.Radio(
                choices=[0,1,2,3,4,5],
                label="Score",
                interactive=True)



    ###############################################################################
    ############# this part is for text to image #############
    ###############################################################################

    with gr.Row(variant="panel",visible=False) as text2image_model:

        with gr.Column():

            #                     # show_download_button=True
            #                 )

    with gr.Row(visible=False) as export:
        chat_log_file = gr.File(label="Download Chat Log",scale=5)

    with gr.Row(elem_id="top_row",visible=False) as top_row:
        task_type = gr.Dropdown(
            ["Session 1","Session 2"],
            value="Session 1", label="Task", interactive=True, elem_classes="custom-language"
        )
        language = gr.Dropdown(
            ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
            value="English", label="Language", interactive=True, elem_classes="custom-language"

            interactive=True,
            label="Generated Caption Length",
        )
        # auto_play = gr.Checkbox(
        #     label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
        # )
        # output_audio = gr.HTML(
        #     label="Synthesised Audio", elem_classes="custom-output"
        # )

    #     )
    recommend_btn.click(
        fn=infer,
        inputs=[new_crop_save_path,image_path],
        outputs=[gallery_result]
    )

    gallery_result.select(
        associate,
        inputs=[paragraph,openai_api_key,language,auto_play,length,log_state,sort_rec],
        outputs=[recommend_bot,output_audio,log_state,pic_index],


    )


    # mv_images = gr.State()

    chatbot.like(print_like_dislike, inputs=[state,log_state], outputs=[log_state,chatbot])

    # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
    #     fn=generate_mvs,

    #     queue=False,
    #     show_progress=False
    # )

    recommend_score.select(
        get_recommendationscore,
        inputs=[pic_index,recommend_score,log_state],
        outputs=[log_state],
    )




    openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
                          outputs=[export, modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt,
                                   modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])
    enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
                                outputs=[export,modules_need_gpt1, modules_need_gpt3,
                                         modules_not_need_gpt,
                                         modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,modules_not_need_gpt3])

    # disable_chatGPT_button.click(init_wo_openai_api_key,
    #                              outputs=[export,modules_need_gpt1, modules_need_gpt3,
    #                                       modules_not_need_gpt,
    #                                       modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])

    # artist_label_base2.click(
    #     get_artistinfo,
    #     inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length],
    #     outputs=[chatbot,state,output_audio]
    # )

    artist_label.click(
        get_artistinfo,
        inputs=[artist_label,openai_api_key,state,language,auto_play,length,log_state],
        outputs=[chatbot,state,output_audio,log_state]
    )
    # artist_label_traj.click(
    #     get_artistinfo,
    #     inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length],
    #     outputs=[chatbot,state,output_audio]
    # )

    # year_label_base2.click(
    #     get_yearinfo,
    #     inputs=[year_label_base2,openai_api_key,state,language,auto_play,length],
    #     outputs=[chatbot,state,output_audio]
    # )
    year_label.click(
        get_yearinfo,
        inputs=[year_label,openai_api_key,state,language,auto_play,length,log_state],
        outputs=[chatbot,state,output_audio,log_state]
    )
    # year_label_traj.click(
    #     get_yearinfo,
    #     inputs=[year_label_traj,openai_api_key,state,language,auto_play,length],
    #     outputs=[chatbot,state,output_audio]
    # )


    # enable_chatGPT_button.click(
    #     lambda: (None, [], [], [[], [], []], "", "", ""),
    #     [],
    #     [image_input, chatbot, state, click_state, paragraph_output, origin_image],
    #     queue=False,
    #     show_progress=False
    # )
    # openai_api_key.submit(
    #     lambda: (None, [], [], [[], [], []], "", "", ""),
    #     [],
    #     [image_input, chatbot, state, click_state, paragraph_output, origin_image],
    #     queue=False,
    #     show_progress=False
    # )

    # cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
    #                             [paragraph_output,output_audio])

    clear_button_click.click(
        lambda x: ([[], [], []], x),

        show_progress=False
    )
    clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
    # clear_button_image.click(
    #     lambda: (None, [], [], [[], [], []], "", "", ""),
    #     [],
    #     [image_input, chatbot, state, click_state, paragraph, origin_image],
    #     queue=False,
    #     show_progress=False
    # )
    # clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])

    clear_button_text.click(
        lambda: ([], [], [[], [], [], []],[]),
        [],
        [chatbot, state, click_state,history_log],
        queue=False,
        show_progress=False
    )
    clear_button_text.click(clear_chat_memory, inputs=[visual_chatgpt])

    image_input.clear(
        lambda: (None, [], [], [[], [], []], "", None, []),
        [],
        [image_input, chatbot, state, click_state, paragraph, origin_image,history_log],
        queue=False,
        show_progress=False
    )

    image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])

    # image_input.change(
    #     lambda: ([], [], [[], [], []], [], []),
    #     [],
    #     [chatbot, state, click_state, history_log, log_state],
    #     queue=False,
    #     show_progress=False
    # )




    # image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
    #                         [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
    #                          image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
    #                          name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
    #                          paragraph,artist,gender,image_path])

    # image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive],
    #                           [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
    #                            image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
    #                            name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
    #                            paragraph,artist,gender,image_path])

    image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log],
                       [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                        image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                        name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
                        paragraph,artist,gender,image_path,log_state,history_log])

    # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
    #                       [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,

    # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
    #                       [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
    #                        image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
    chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path,log_state,history_log],
                      [chatbot, state, aux_state,output_audio,log_state,history_log])
    # chat_input.submit(lambda: "", None, chat_input)
    chat_input.submit(lambda: {"text": ""}, None, chat_input)
    # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
    #                          [chatbot, state, aux_state,output_audio])
    # submit_button_text.click(lambda: "", None, chat_input)
    example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log],
                         [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
                          image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
                          name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
                          paragraph,artist,gender,image_path, log_state,history_log])

    example_image.change(clear_chat_memory, inputs=[visual_chatgpt])

    # def on_click_tab_selected():
    #     if gpt_state ==1:
    #         print(gpt_state)
    #         print("using gpt")
    #         return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
    #     else:
    #         print("no gpt")
    #         print("gpt_state",gpt_state)
    #         return [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2

    # def on_base_selected():
    #     if gpt_state ==1:
    #         print(gpt_state)
    #         print("using gpt")
    #         return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2
    #     else:
    #         print("no gpt")
    #         return [gr.update(visible=False)]*4


    # traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
    # click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
    # base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
    # base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])


        inputs=[
            origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
            image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state
        ],
        outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
        show_progress=False, queue=True

        submit_caption,
        inputs=[
            naritive, state,length, sentiment, factuality, language,
            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph,focus_d,openai_api_key,new_crop_save_path,gender,log_state,history_log
        ],
        outputs=[
            chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
        ],
        show_progress=True,
        queue=True

        submit_caption,
        inputs=[
            naritive,state,length, sentiment, factuality, language,
            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,auto_play, paragraph,focus_da,openai_api_key,new_crop_save_path,gender,log_state,
            history_log
        ],
        outputs=[
            chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
        ],
        show_progress=True,
        queue=True
    )

        inputs=[
            naritive,state,length, sentiment, factuality, language,
            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
            auto_play, paragraph,focus_dai,openai_api_key,new_crop_save_path,gender,log_state,history_log
        ],
        outputs=[
            chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
        ],
        show_progress=True,
        queue=True

        inputs=[
            naritive,state,length, sentiment, factuality, language,
            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
            auto_play, paragraph,focus_dda,openai_api_key,new_crop_save_path,gender,log_state,history_log
        ],
        outputs=[
            chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio,log_state,history_log
        ],
        show_progress=True,
        queue=True

    export_button.click(
        export_chat_log,
        inputs=[log_state,log_list,naritive],
        outputs=[chat_log_file,log_list],
        queue=True
    )

    naritive.change(
        change_naritive,
        [task_type, image_input, chatbot, state, click_state, paragraph, origin_image,naritive,language],
        [image_input, chatbot, state, click_state, paragraph, origin_image,gallery_result],
        queue=False,
        show_progress=False

    )

    task_type.change(
        lambda: ([]),
        [],
        [log_state]
    )

    # upvote_btn.click(
    #     handle_liked,
    #     inputs=[state,like_res],