Salesforce/instructblip-flan-t5-xxl · Scoring caption with likelihood and the prompt "Write a description for the photo."

Hi,
I'm trying to compute the log-likelihood of some captions given an image and a prompt to do retrieval with InstructBLIP.
I'm attaching the most important parts of the code.

def compute_avg_likelihood(scores, ids, mask):
    # Compute log likelihood given scores, ids and mask
    # Softmax
    scores = torch.nn.functional.softmax(scores, dim=-1)
    
    # Gather ids to get the probability of each token
    logits = torch.gather(scores, 2, einops.repeat(ids.unsqueeze(dim=2),
                                                   "B L 1 -> B L H",
                                                   H=scores.shape[2]))[:, :, 0]
    # Do logarithm
    logits = torch.log(logits)
    # Exclude padding values
    logits = logits * mask
   # Compute avg likelihood (the number of non zeros in the mask corresponds to seq length)
    logits = torch.sum(logits, dim=1) / torch.count_nonzero(mask, dim=1)
    return logits

processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl")
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-flan-t5-xl").to(
        torch.float16)
model.to(custom_args.device)
model = model.eval()
prompt = "Write a description for the photo."

for it, sample in enumerate(tqdm(compositional_dataset_dict[dataset_name],
                                     mininterval=1, total=len(compositional_dataset_dict[dataset_name]))):
        # Read images, true captions and false captions
        images = sample["raw_image"]
        true_caption = sample["raw_caption"]
        false_caption = sample["raw_caption_false"]
        
        true_caption_dict = processor.tokenizer(true_caption, truncation=True, padding=True)
        false_caption_dict = processor.tokenizer(false_caption, truncation=True, padding=True)
        
        # Get input_ids and masks for the true caption and false caption
        true_caption_id = torch.Tensor(true_caption_dict["input_ids"]).to(torch.int64).to(custom_args.device)
        false_caption_id = torch.Tensor(false_caption_dict["input_ids"]).to(torch.int64).to(custom_args.device)
        true_caption_mask = torch.Tensor(true_caption_dict["attention_mask"]).to(torch.int64).to(custom_args.device)
        false_caption_mask = torch.Tensor(false_caption_dict["attention_mask"]).to(torch.int64).to(custom_args.device)
        
        # Pad sequences and masks
        if true_caption_id.shape[1] > false_caption_id.shape[1]:
            offset = true_caption_id.shape[1] - false_caption_id.shape[1]
            padding = torch.zeros(true_caption_id.shape[0], offset).to(custom_args.device).to(torch.int64)
            false_caption_id = torch.cat([false_caption_id, padding], dim=1)
            false_caption_mask = torch.cat([false_caption_mask, padding], dim=1)
        elif true_caption_id.shape[1] < false_caption_id.shape[1]:
            offset = false_caption_id.shape[1] - true_caption_id.shape[1]
            padding = torch.zeros(true_caption_id.shape[0], offset).to(custom_args.device).to(torch.int64)
            true_caption_id = torch.cat([true_caption_id, padding], dim=1)
            true_caption_mask = torch.cat([true_caption_mask, padding], dim=1)
        else:
            None
         
        # Replicate the prompt for the number of images
        true_prompt = [prompt for _ in range(len(images))]

        # Prepare input
        inputs = processor(images=images,
                           text=true_prompt,
                           return_tensors="pt").to(custom_args.device)
        
        with torch.no_grad():
            # Compute outputs:
            # Min_new_tokens = max_new_tokens to constrain the output to be of the same dimension as the true_caption_id  and 
            # false_caption_id
            outputs = model.generate(**inputs,
                                     output_logits=True,
                                    return_dict_in_generate=True,
                                    min_new_tokens=true_caption_id.shape[1],
                                    max_new_tokens=true_caption_id.shape[1])
        
        # Extract logits from the outputs
        logits = torch.cat([outputs.logits[i].unsqueeze(dim=1)
                            for i in range(len(outputs.logits))], dim=1)
        
        # Compute scores for positive caption
        positive_scores = np.array(compute_avg_likelihood(logits, true_caption_id, true_caption_mask).cpu())
        # Compute scores for negative caption
        negative_scores = np.array(compute_avg_likelihood(logits, false_caption_id, false_caption_mask).cpu())

In particular:
logits taken from the outputs have a dimension of N-1 where N is the dimension of the output sequence. The Begin Of Sentence token is excluded from logits return? Looking at the internal code it seems that the Begin Of Sentence token is excluded from logits return

the tokenization returns input ids for the caption without the Begin Of Sentence token:

true_caption_dict = processor.tokenizer(true_caption, truncation=True, padding=True)
true_caption_id = torch.Tensor(true_caption_dict["input_ids"]).to(torch.int64).to(custom_args.device)

It seems that the tokenizer excludes the Begin Of Sentence token as well as logits from generate()
I'm not able to find where the bug is.

I appreciate a lot an answer, thanks