visualBERT config.json
Hi @gchhablani, how are you?
I am doing VisualBERT VQA evaluation, but the target tensor is always 0. Could you share the PyTorch dataset for local files, or update the config.json? Thank you.
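For reference, this is the quick check I would run on the loaded config (just a sketch, pointing at the same local checkpoint path used in the script below). If label2id only holds generic placeholder keys instead of the actual answer strings, every answer gets skipped in my labeling loop and the target tensor stays all zeros:

from transformers import VisualBertConfig

config = VisualBertConfig.from_pretrained("./pretrained/visualBERT/visualbert-vqa")
print("number of labels:", len(config.label2id))
# real VQA answer strings such as "yes" should appear here, not generic "LABEL_0"-style keys
print("sample entries:", list(config.label2id.items())[:5])
print("'yes' in label2id:", "yes" in config.label2id)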
Here is my code:
===========================================================================================
# Read questions
import json

# Open the questions JSON file
f_read_questions = open('./vqa2/v2_OpenEnded_mscoco_val2014_questions.json')

# Parse the JSON object into a dictionary
data_questions = json.load(f_read_questions)
print(data_questions.keys())

questions = data_questions['questions']
print("Number of questions:", len(questions))
from os import listdir
from os.path import isfile, join

# Root directory where all the images are stored
root = './multimodal_data/vqa2/val2014'
file_names = [f for f in listdir(root) if isfile(join(root, f))]

import re
from typing import Optional

# Map a COCO file name to its numeric image id
filename_re = re.compile(r".*(\d{12})\.((jpg)|(png))")
# source: https://github.com/allenai/allennlp-models/blob/a36aed540e605c4293c25f73d6674071ca9edfc3/allennlp_models/vision/dataset_readers/vqav2.py#L141

def id_from_filename(filename: str) -> Optional[int]:
    match = filename_re.fullmatch(filename)
    if match is None:
        return None
    return int(match.group(1))

filename_to_id = {root + "/" + file: id_from_filename(file) for file in file_names}
id_to_filename = {v: k for k, v in filename_to_id.items()}
# Read annotations
f_read_annotations = open("./vqa2/v2_mscoco_val2014_annotations.json")

# Parse the JSON object into a dictionary
data_annotations = json.load(f_read_annotations)
print(data_annotations.keys())

# Show the answers
annotations = data_annotations['annotations']
print("Number of annotations:", len(annotations))
from transformers import VisualBertConfig, VisualBertModel

# Load the VisualBERT config; label2id / id2label map answer strings to class indices
config = VisualBertConfig.from_pretrained("./pretrained/visualBERT/visualbert-vqa")
from tqdm.notebook import tqdm

def get_score(count: int) -> float:
    # VQA soft score: an answer given by 3 or more annotators counts as fully correct
    return min(1.0, count / 3)

# Attach label indices and soft scores to every annotation
for annotation in tqdm(annotations):
    answers = annotation['answers']
    answer_count = {}
    for answer in answers:
        answer_ = answer["answer"]
        answer_count[answer_] = answer_count.get(answer_, 0) + 1
    labels = []
    scores = []
    for answer in answer_count:
        # Skip answers that are not in the model's label space
        if answer not in config.label2id:
            continue
        labels.append(config.label2id[answer])
        score = get_score(answer_count[answer])
        scores.append(score)
    annotation['labels'] = labels
    annotation['scores'] = scores
import torch

class VQADataset(torch.utils.data.Dataset):
    """VQA (v2) dataset."""

    def __init__(self, questions, annotations, image_preprocess):
        self.questions = questions
        self.annotations = annotations
        self.image_preprocess = image_preprocess

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # get image + text
        annotation = self.annotations[idx]
        question = self.questions[idx]
        image = id_to_filename[annotation['image_id']]
        text = question['question']

        inputs = tokenizer(
            text,
            padding="max_length",
            max_length=40,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        # Faster R-CNN: extract region-of-interest features for the visual embeddings
        images, sizes, scales_yx = self.image_preprocess(image)
        output_dict = frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=frcnn_cfg.max_detections,
            return_tensors="pt",
        )
        features = output_dict.get("roi_features")

        inputs.update(
            {
                "input_ids": inputs.input_ids,
                "attention_mask": inputs.attention_mask,
                "token_type_ids": inputs.token_type_ids,
                "visual_embeds": features,
                "visual_attention_mask": torch.ones(features.shape[:-1], dtype=torch.float),
                "visual_token_type_ids": torch.ones(features.shape[:-1], dtype=torch.long),
                # "output_attentions": False
            }
        )

        # remove batch dimension
        for k, v in inputs.items():
            inputs[k] = v.squeeze()

        # add labels: soft target vector over the answer vocabulary
        # based on: https://github.com/dandelin/ViLT/blob/762fd3975c180db6fc88f577cf39549983fa373a/vilt/modules/objectives.py#L301
        labels = annotation['labels']
        scores = torch.tensor(annotation['scores'])
        targets = torch.zeros(len(config.id2label), dtype=torch.float)
        for label, score in zip(labels, scores):
            targets[label] = score
        inputs["labels"] = targets

        return inputs
if __name__ == '__main__':
    from visual_bert.processing_image import Preprocess
    from visual_bert.visualizing_image import SingleImageViz
    from visual_bert.modeling_frcnn import GeneralizedRCNN
    from visual_bert.utils import Config
    from torch.utils.data import DataLoader
    import torch.nn as nn

    frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
    frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
    image_preprocess = Preprocess(frcnn_cfg)

    from transformers import VisualBertForQuestionAnswering, BertTokenizerFast
    tokenizer = BertTokenizerFast.from_pretrained("./pretrained/bert-base-uncased")
    model = VisualBertForQuestionAnswering.from_pretrained("./pretrained/visualBERT/visualbert-vqa")

    # if cfg.use_multi_gpu:
    #     model = nn.DataParallel(model)
    #     model = model.to(device=device)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    dataset = VQADataset(questions=questions[:100],
                         annotations=annotations[:100],
                         image_preprocess=image_preprocess)
    test_dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    correct = 0.0
    total = 0
    loss_function = nn.CrossEntropyLoss()
    # optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

    for batch in tqdm(test_dataloader):
        # with torch.no_grad():
        batch = {k: v.to(device) for k, v in batch.items()}
        # here is the problem: the targets are always all zeros
        print(batch["labels"])

        # forward pass
        outputs = model(**batch)
        logits = outputs.logits
        _, pre = torch.max(logits.data, 1)
        _, target = torch.max(batch["labels"].data, 1)
        print("prediction:", pre)
        print("target:", target)

        correct += (pre == target).sum().item()
        total = total + 1
        print(total)
        print("==============================================================")

    final_acc = correct / float(len(test_dataloader.dataset))
    print('Accuracy of test: %f %%' % (100 * float(final_acc)))