visualBERT config.json
Hi @gchhablani, how are you?
I am doing VisualBERT VQA evaluation, but the target tensor is always 0. Could you share the PyTorch dataset for local files, or update the config.json? Thank you.
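For reference, this is the quick check I would run on the loaded config (just a sketch, pointing at the same local checkpoint path used in the script below). If label2id only holds generic placeholder keys instead of the actual answer strings, every answer gets skipped in my labeling loop and the target tensor stays all zeros:

from transformers import VisualBertConfig

config = VisualBertConfig.from_pretrained("./pretrained/visualBERT/visualbert-vqa")
print("number of labels:", len(config.label2id))
# real VQA answer strings such as "yes" should appear here, not generic "LABEL_0"-style keys
print("sample entries:", list(config.label2id.items())[:5])
print("'yes' in label2id:", "yes" in config.label2id)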
Here is my code:
===========================================================================================
# Read questions
import json

# Open the questions JSON file
f_read_questions = open('./vqa2/v2_OpenEnded_mscoco_val2014_questions.json')

# Parse the JSON object into a dictionary
data_questions = json.load(f_read_questions)
print(data_questions.keys())

questions = data_questions['questions']
print("Number of questions:", len(questions))
from os import listdir
from os.path import isfile, join

# Root directory where all the images are stored
root = './multimodal_data/vqa2/val2014'
file_names = [f for f in listdir(root) if isfile(join(root, f))]

import re
from typing import Optional

# Map a COCO file name to its numeric image id
filename_re = re.compile(r".*(\d{12})\.((jpg)|(png))")
# source: https://github.com/allenai/allennlp-models/blob/a36aed540e605c4293c25f73d6674071ca9edfc3/allennlp_models/vision/dataset_readers/vqav2.py#L141

def id_from_filename(filename: str) -> Optional[int]:
    match = filename_re.fullmatch(filename)
    if match is None:
        return None
    return int(match.group(1))

filename_to_id = {root + "/" + file: id_from_filename(file) for file in file_names}
id_to_filename = {v: k for k, v in filename_to_id.items()}
# Read annotations
f_read_annotations = open("./vqa2/v2_mscoco_val2014_annotations.json")

# Parse the JSON object into a dictionary
data_annotations = json.load(f_read_annotations)
print(data_annotations.keys())

# Show the answers
annotations = data_annotations['annotations']
print("Number of annotations:", len(annotations))
from transformers import VisualBertConfig, VisualBertModel

# Load the VisualBERT config; label2id / id2label map answer strings to class indices
config = VisualBertConfig.from_pretrained("./pretrained/visualBERT/visualbert-vqa")
from tqdm.notebook import tqdm

def get_score(count: int) -> float:
    # VQA soft score: an answer given by 3 or more annotators counts as fully correct
    return min(1.0, count / 3)

# Attach label indices and soft scores to every annotation
for annotation in tqdm(annotations):
    answers = annotation['answers']
    answer_count = {}
    for answer in answers:
        answer_ = answer["answer"]
        answer_count[answer_] = answer_count.get(answer_, 0) + 1
    labels = []
    scores = []
    for answer in answer_count:
        # Skip answers that are not in the model's label space
        if answer not in config.label2id:
            continue
        labels.append(config.label2id[answer])
        score = get_score(answer_count[answer])
        scores.append(score)
    annotation['labels'] = labels
    annotation['scores'] = scores
import torch

class VQADataset(torch.utils.data.Dataset):
    """VQA (v2) dataset."""

    def __init__(self, questions, annotations, image_preprocess):
        self.questions = questions
        self.annotations = annotations
        self.image_preprocess = image_preprocess

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # get image + text
        annotation = self.annotations[idx]
        question = self.questions[idx]
        image = id_to_filename[annotation['image_id']]
        text = question['question']

        inputs = tokenizer(
            text,
            padding="max_length",
            max_length=40,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        # Faster R-CNN: extract region-of-interest features for the visual embeddings
        images, sizes, scales_yx = self.image_preprocess(image)
        output_dict = frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=frcnn_cfg.max_detections,
            return_tensors="pt",
        )
        features = output_dict.get("roi_features")

        inputs.update(
            {
                "input_ids": inputs.input_ids,
                "attention_mask": inputs.attention_mask,
                "token_type_ids": inputs.token_type_ids,
                "visual_embeds": features,
                "visual_attention_mask": torch.ones(features.shape[:-1], dtype=torch.float),
                "visual_token_type_ids": torch.ones(features.shape[:-1], dtype=torch.long),
                # "output_attentions": False
            }
        )

        # remove batch dimension
        for k, v in inputs.items():
            inputs[k] = v.squeeze()

        # add labels: soft target vector over the answer vocabulary
        # based on: https://github.com/dandelin/ViLT/blob/762fd3975c180db6fc88f577cf39549983fa373a/vilt/modules/objectives.py#L301
        labels = annotation['labels']
        scores = torch.tensor(annotation['scores'])
        targets = torch.zeros(len(config.id2label), dtype=torch.float)
        for label, score in zip(labels, scores):
            targets[label] = score
        inputs["labels"] = targets

        return inputs
if __name__ == '__main__':
    from visual_bert.processing_image import Preprocess
    from visual_bert.visualizing_image import SingleImageViz
    from visual_bert.modeling_frcnn import GeneralizedRCNN
    from visual_bert.utils import Config
    from torch.utils.data import DataLoader
    import torch.nn as nn

    frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
    frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
    image_preprocess = Preprocess(frcnn_cfg)

    from transformers import VisualBertForQuestionAnswering, BertTokenizerFast
    tokenizer = BertTokenizerFast.from_pretrained("./pretrained/bert-base-uncased")
    model = VisualBertForQuestionAnswering.from_pretrained("./pretrained/visualBERT/visualbert-vqa")

    # if cfg.use_multi_gpu:
    #     model = nn.DataParallel(model)
    #     model = model.to(device=device)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    dataset = VQADataset(questions=questions[:100],
                         annotations=annotations[:100],
                         image_preprocess=image_preprocess)
    test_dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    correct = 0.0
    total = 0
    loss_function = nn.CrossEntropyLoss()
    # optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

    for batch in tqdm(test_dataloader):
        # with torch.no_grad():
        batch = {k: v.to(device) for k, v in batch.items()}
        # here is the problem: the targets are always all zeros
        print(batch["labels"])

        # forward pass
        outputs = model(**batch)
        logits = outputs.logits
        _, pre = torch.max(logits.data, 1)
        _, target = torch.max(batch["labels"].data, 1)
        print("prediction:", pre)
        print("target:", target)

        correct += (pre == target).sum().item()
        total = total + 1
        print(total)
        print("==============================================================")

    final_acc = correct / float(len(test_dataloader.dataset))
    print('Accuracy of test: %f %%' % (100 * float(final_acc)))