Spaces:

xxx1
/

VQA_CAP_GPT

Runtime error

App Files Files Community

VQA_CAP_GPT / app.py

xxx1's picture

Update app.py

f66ceb8 almost 2 years ago

history blame contribute delete

9.9 kB

	import string
	import gradio as gr
	import requests
	import torch
	from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline
	from PIL import Image

	from transformers import (
	    BlipForConditionalGeneration,
	    BlipForQuestionAnswering,
	    BlipProcessor,
	)

	# --- VLE visual question answering; the pipeline is pinned to the CPU ---
	model_name = "hfl/vle-base-for-vqa"
	model = VLEForVQA.from_pretrained(model_name)
	vle_processor = VLEProcessor.from_pretrained(model_name)
	vqa_pipeline = VLEForVQAPipeline(
	    model=model,
	    device="cpu",
	    vle_processor=vle_processor,
	)

	# --- BLIP visual question answering, moved to the GPU when one is available ---
	# NOTE(review): no live code visible in this file uses `processor` or
	# `model_vqa`; confirm whether this checkpoint still needs to be loaded.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	_blip_vqa_checkpoint = "Salesforce/blip-vqa-capfilt-large"
	processor = BlipProcessor.from_pretrained(_blip_vqa_checkpoint)
	model_vqa = BlipForQuestionAnswering.from_pretrained(_blip_vqa_checkpoint).to(device)

	# --- BLIP image captioning; this model is not moved to `device` ---
	_blip_caption_checkpoint = "Salesforce/blip-image-captioning-large"
	cap_processor = BlipProcessor.from_pretrained(_blip_caption_checkpoint)
	cap_model = BlipForConditionalGeneration.from_pretrained(_blip_caption_checkpoint)



	def caption(input_image):
	    """Describe ``input_image`` in text using the BLIP captioning model.

	    The image is preprocessed with ``cap_processor``, fed to
	    ``cap_model.generate``, and every decoded sequence is joined with
	    newlines into a single string.
	    """
	    model_inputs = cap_processor(input_image, return_tensors="pt")
	    generated_ids = cap_model.generate(**model_inputs)
	    decoded = cap_processor.batch_decode(generated_ids, skip_special_tokens=True)
	    return "\n".join(decoded)
	import openai
	import os

	# The OpenAI key comes from the `openai_appkey` environment variable;
	# it is None when that variable is not set.
	openai.api_key = os.environ.get('openai_appkey')
	_CHOICE_LETTERS = ("A", "B", "C", "D")


	def _format_lettered_candidates(letters, candidates):
	    """Render ``(answer, score)`` pairs as ``A: ans, score:s; B: ...``."""
	    return "; ".join(
	        letter + ": " + answer + ", score:" + str(score)
	        for letter, (answer, score) in zip(letters, candidates)
	    )


	def _pick_choice_letter(reply, letters):
	    """Return the first stand-alone option letter found in ``reply``.

	    Newlines, colons, periods and commas are treated as separators. Falls
	    back to ``letters[0]`` when the reply names none of ``letters``.
	    """
	    cleaned = reply.translate(str.maketrans("\n:.,", "    "))
	    for token in cleaned.split(" "):
	        # `letters` is a tuple, so this is an exact match, never a substring test.
	        if token in letters:
	            return token
	    return letters[0]


	def gpt3_short(question, vqa_answer, caption):
	    """Ask the LLM to pick one of the VQA candidates and return that answer.

	    Args:
	        question: The question asked about the picture.
	        vqa_answer: A pair ``(answers, scores)`` of parallel sequences; at
	            most the first four candidates are offered as options A-D.
	        caption: Caption text of the picture (this parameter shadows the
	            module-level ``caption`` function inside this body).

	    Returns:
	        The text of the candidate answer whose letter the LLM chose, or the
	        first candidate when no valid letter is found in the reply.

	    Raises:
	        ValueError: If ``vqa_answer`` holds no candidates.
	    """
	    answers, scores = vqa_answer
	    candidates = list(zip(answers, scores))[:len(_CHOICE_LETTERS)]
	    if not candidates:
	        raise ValueError("vqa_answer must contain at least one candidate answer")

	    # Only letters that map to a real candidate are accepted, so a reply of
	    # e.g. "D" cannot index past the end when fewer than four answers exist.
	    letters = _CHOICE_LETTERS[:len(candidates)]

	    prompt = (
	        "This is the caption of a picture: " + caption
	        + ". Question: " + question
	        + " VQA model predicts:" + _format_lettered_candidates(letters, candidates)
	        + ". Choose A if it is not in conflict with the description of the picture and A's score is bigger than 0.8; otherwise choose the B, C or D based on the description."
	    )

	    # NOTE(review): `text-davinci-003` and the legacy Completions endpoint
	    # are reported as retired by OpenAI -- confirm a supported model.
	    response = openai.Completion.create(
	        engine="text-davinci-003",
	        prompt=prompt,
	        max_tokens=10,
	        n=1,
	        stop=None,
	        temperature=0.7,
	    )
	    reply = response.choices[0].text.strip()

	    chosen = _pick_choice_letter(reply, letters)
	    return candidates[letters.index(chosen)][0]
	def gpt3_long(question, vqa_answer, caption):
	    """Ask the LLM for a one-sentence answer given the caption and VQA candidates.

	    Args:
	        question: The question asked about the picture.
	        vqa_answer: A pair ``(answers, scores)`` of parallel sequences; at
	            most the first four candidates are included in the prompt.
	        caption: Caption text of the picture (this parameter shadows the
	            module-level ``caption`` function inside this body).

	    Returns:
	        The stripped text of the first completion choice.

	    Raises:
	        ValueError: If ``vqa_answer`` holds no candidates.
	    """
	    answers, scores = vqa_answer
	    max_candidates = 4
	    candidates = list(zip(answers, scores))[:max_candidates]
	    if not candidates:
	        raise ValueError("vqa_answer must contain at least one candidate answer")

	    # str() is applied explicitly so each score is rendered exactly as
	    # str(score) would render it, whatever numeric type it has.
	    listing = "; ".join(
	        answer + ", score:" + str(score) for answer, score in candidates
	    )
	    # The question is repeated on purpose right before the instruction.
	    prompt = (
	        "This is the caption of a picture: " + caption
	        + ". Question: " + question
	        + " VQA model predicts: " + listing
	        + ". Question: " + question
	        + " Tell me the right answer with a sentence."
	    )

	    # NOTE(review): `text-davinci-003` and the legacy Completions endpoint
	    # are reported as retired by OpenAI -- confirm a supported model.
	    response = openai.Completion.create(
	        engine="text-davinci-003",
	        prompt=prompt,
	        max_tokens=30,
	        n=1,
	        stop=None,
	        temperature=0.7,
	    )
	    return response.choices[0].text.strip()
	def gpt3(question, vqa_answer, caption):
	    """Send caption, question and a pre-formatted VQA answer string to the LLM.

	    The three strings are placed on separate lines, followed by a request
	    for the right answer; the stripped text of the first completion choice
	    is returned. Here ``vqa_answer`` is a single string, not a pair.
	    """
	    prompt_text = "\n".join(
	        (caption, question, vqa_answer, " Tell me the right answer.")
	    )
	    completion = openai.Completion.create(
	        engine="text-davinci-003",
	        prompt=prompt_text,
	        max_tokens=30,
	        n=1,
	        stop=None,
	        temperature=0.7,
	    )
	    first_choice = completion.choices[0]
	    return first_choice.text.strip()

	def vle(input_image, input_text):
	    """Run the VLE VQA pipeline and split its top-4 predictions.

	    Returns a pair ``(answers, scores)`` of parallel lists built from the
	    ``'answer'`` and ``'score'`` entries of each prediction.
	    """
	    predictions = vqa_pipeline(
	        {"image": input_image, "question": input_text},
	        top_k=4,
	    )
	    answers = []
	    scores = []
	    for prediction in predictions:
	        answers.append(prediction['answer'])
	    for prediction in predictions:
	        scores.append(prediction['score'])
	    return answers, scores
	def inference_chat(input_image, input_text):
	    """Answer ``input_text`` about ``input_image`` with VLE alone and with VLE + LLM.

	    Args:
	        input_image: The picture to question; ``None`` is rejected.
	        input_text: The question about the picture.

	    Returns:
	        A 3-tuple ``(top VLE answer, long LLM answer, short LLM answer)``.

	    Raises:
	        gr.Error: If no image was supplied, so the UI shows a readable
	            message instead of a failure from inside the caption model.
	    """
	    if input_image is None:
	        raise gr.Error("Please upload an image before asking a question.")

	    cap = caption(input_image)
	    print(cap)  # the generated caption is echoed to stdout

	    # `out` is the (answers, scores) pair produced by the VLE pipeline.
	    out = vle(input_image, input_text)
	    gpt3_out = gpt3_long(input_text, out, cap)
	    gpt3_out1 = gpt3_short(input_text, out, cap)
	    return out[0][0], gpt3_out, gpt3_out1
	# Markdown texts rendered in the UI: page heading, introduction, and the
	# explanation shown next to the answer boxes.
	title = """# VQA with VLE and LLM"""
	description = """VLE (Visual-Language Encoder) is an image-text multimodal understanding model built on the pre-trained text and image encoders. See https://github.com/iflytek/VLE for more details.
	We demonstrate visual question answering systems built with VLE and LLM."""
	description1 = """VQA: The image and the question are fed to a VQA model (VLEForVQA) and the model predicts the answer.

	VQA+LLM: We feed the caption, question, and answers predicted by the VQA model to the LLM and ask the LLM to generate the final answer. The outputs from VQA+LLM may vary due to the decoding strategy of the LLM."""

	# Build the Gradio UI: an image input, a question box with Clear/Submit
	# buttons, and three answer text boxes, plus the event wiring and examples.
	# NOTE(review): indentation was lost in this copy of the file; the nesting
	# of the Row/Column contexts below must be restored before this can run.
	with gr.Blocks(
	css="""
	.message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
	#component-21 > div.wrap.svelte-w6rprc {height: 600px;}
	"""
	) as iface:
	# `state` is only written by the Clear handler below; nothing reads it here.
	state = gr.State([])
	#caption_output = None
	gr.Markdown(title)
	gr.Markdown(description)
	#gr.Markdown(article)

	# Input side: image, question text box, and the Clear / Submit buttons.
	with gr.Row():
	with gr.Column(scale=1):
	image_input = gr.Image(type="pil",label="VQA Image Input")
	with gr.Row():
	with gr.Column(scale=1):
	chat_input = gr.Textbox(lines=1, label="VQA Question Input")
	with gr.Row():
	clear_button = gr.Button(value="Clear", interactive=True,width=30)
	submit_button = gr.Button(
	value="Submit", interactive=True, variant="primary"
	)
	# The string literal below is never executed; it holds two disabled buttons.
	'''
	cap_submit_button = gr.Button(
	value="Submit_CAP", interactive=True, variant="primary"
	)
	gpt3_submit_button = gr.Button(
	value="Submit_GPT3", interactive=True, variant="primary"
	)
	'''
	# Output side: explanation text and one text box per answer kind.
	with gr.Column():
	gr.Markdown(description1)
	caption_output = gr.Textbox(lines=0, label="VQA")
	caption_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (short answer)")
	gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")



	# image_input.change(
	# lambda: ("", [],"","",""),
	# [],
	# [ caption_output, state,caption_output,gpt3_output_v1,caption_output_v1],
	# queue=False,
	# )
	# Submitting the question box runs inference_chat on (image, question); its
	# three return values fill the VQA, long-answer and short-answer boxes.
	chat_input.submit(
	inference_chat,
	[
	image_input,
	chat_input,
	],
	[ caption_output,gpt3_output_v1,caption_output_v1],
	)
	# Clear resets the question, the state list and all three answer boxes;
	# it is registered with queue=False.
	clear_button.click(
	lambda: ("", [],"","",""),
	[],
	[chat_input, state,caption_output,gpt3_output_v1,caption_output_v1],
	queue=False,
	)
	# The Submit button is wired exactly like chat_input.submit above.
	submit_button.click(
	inference_chat,
	[
	image_input,
	chat_input,
	],
	[caption_output,gpt3_output_v1,caption_output_v1],
	)
	# The string literal below is never executed; it holds the handlers for the
	# two disabled buttons.
	'''
	cap_submit_button.click(
	caption,
	[
	image_input,

	],
	[caption_output_v1],
	)
	gpt3_submit_button.click(
	gpt3,
	[
	chat_input,
	caption_output ,
	caption_output_v1,
	],
	[gpt3_output_v1],
	)
	'''
	# Each example row is: image file, question, then values for the VQA,
	# short-answer and long-answer boxes (same order as `inputs` below).
	examples=[['bird.jpeg',"How many birds are there in the tree?","2","2","2"],
	['qa9.jpg',"What type of vehicle is being pulled by the horses ?",'carriage','sled','Sled'],
	['upload4.jpg',"What is this old man doing?","fishing","fishing","Fishing"]]
	# The name `examples` is rebound from the raw list to the gr.Examples object.
	examples = gr.Examples(
	examples=examples,inputs=[image_input, chat_input,caption_output,caption_output_v1,gpt3_output_v1],
	)

	# Turn on request queuing with concurrency_count=1, max_size=10 and
	# api_open=False, then start the app.
	iface.queue(concurrency_count=1, api_open=False, max_size=10)
	# queue() above already enables queuing, so the deprecated `enable_queue`
	# flag is no longer passed to launch().
	iface.launch()