Scoring caption with likelihood and the prompt "Write a description for the photo."
#5
by
FiorenzoParascandolo
- opened
Hi,
I'm trying to compute the log-likelihood of some captions given an image and a prompt to do retrieval with InstructBLIP.
I'm attaching the most important parts of the code.
def compute_avg_likelihood(scores, ids, mask):
# Compute log likelihood given scores, ids and mask
# Softmax
scores = torch.nn.functional.softmax(scores, dim=-1)
# Gather ids to get the probability of each token
logits = torch.gather(scores, 2, einops.repeat(ids.unsqueeze(dim=2),
"B L 1 -> B L H",
H=scores.shape[2]))[:, :, 0]
# Do logarithm
logits = torch.log(logits)
# Exclude padding values
logits = logits * mask
# Compute avg likelihood (the number of non zeros in the mask corresponds to seq length)
logits = torch.sum(logits, dim=1) / torch.count_nonzero(mask, dim=1)
return logits
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl")
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-flan-t5-xl").to(
torch.float16)
model.to(custom_args.device)
model = model.eval()
prompt = "Write a description for the photo."
for it, sample in enumerate(tqdm(compositional_dataset_dict[dataset_name],
mininterval=1, total=len(compositional_dataset_dict[dataset_name]))):
# Read images, true captions and false captions
images = sample["raw_image"]
true_caption = sample["raw_caption"]
false_caption = sample["raw_caption_false"]
true_caption_dict = processor.tokenizer(true_caption, truncation=True, padding=True)
false_caption_dict = processor.tokenizer(false_caption, truncation=True, padding=True)
# Get input_ids and masks for the true caption and false caption
true_caption_id = torch.Tensor(true_caption_dict["input_ids"]).to(torch.int64).to(custom_args.device)
false_caption_id = torch.Tensor(false_caption_dict["input_ids"]).to(torch.int64).to(custom_args.device)
true_caption_mask = torch.Tensor(true_caption_dict["attention_mask"]).to(torch.int64).to(custom_args.device)
false_caption_mask = torch.Tensor(false_caption_dict["attention_mask"]).to(torch.int64).to(custom_args.device)
# Pad sequences and masks
if true_caption_id.shape[1] > false_caption_id.shape[1]:
offset = true_caption_id.shape[1] - false_caption_id.shape[1]
padding = torch.zeros(true_caption_id.shape[0], offset).to(custom_args.device).to(torch.int64)
false_caption_id = torch.cat([false_caption_id, padding], dim=1)
false_caption_mask = torch.cat([false_caption_mask, padding], dim=1)
elif true_caption_id.shape[1] < false_caption_id.shape[1]:
offset = false_caption_id.shape[1] - true_caption_id.shape[1]
padding = torch.zeros(true_caption_id.shape[0], offset).to(custom_args.device).to(torch.int64)
true_caption_id = torch.cat([true_caption_id, padding], dim=1)
true_caption_mask = torch.cat([true_caption_mask, padding], dim=1)
else:
None
# Replicate the prompt for the number of images
true_prompt = [prompt for _ in range(len(images))]
# Prepare input
inputs = processor(images=images,
text=true_prompt,
return_tensors="pt").to(custom_args.device)
with torch.no_grad():
# Compute outputs:
# Min_new_tokens = max_new_tokens to constrain the output to be of the same dimension as the true_caption_id and
# false_caption_id
outputs = model.generate(**inputs,
output_logits=True,
return_dict_in_generate=True,
min_new_tokens=true_caption_id.shape[1],
max_new_tokens=true_caption_id.shape[1])
# Extract logits from the outputs
logits = torch.cat([outputs.logits[i].unsqueeze(dim=1)
for i in range(len(outputs.logits))], dim=1)
# Compute scores for positive caption
positive_scores = np.array(compute_avg_likelihood(logits, true_caption_id, true_caption_mask).cpu())
# Compute scores for negative caption
negative_scores = np.array(compute_avg_likelihood(logits, false_caption_id, false_caption_mask).cpu())
In particular:
logits taken from the outputs have a dimension of N-1 where N is the dimension of the output sequence. The Begin Of Sentence token is excluded from logits return? Looking at the internal code it seems that the Begin Of Sentence token is excluded from logits return
the tokenization returns input ids for the caption without the Begin Of Sentence token:
true_caption_dict = processor.tokenizer(true_caption, truncation=True, padding=True)
true_caption_id = torch.Tensor(true_caption_dict["input_ids"]).to(torch.int64).to(custom_args.device)
It seems that the tokenizer excludes the Begin Of Sentence token as well as logits from generate()
I'm not able to find where the bug is.
I appreciate a lot an answer, thanks