# Copyright (c) Kakaobrain, Inc. and its affiliates. All Rights Reserved
"""
V-COCO dataset which returns image_id for evaluation.
"""
from pathlib import Path
from PIL import Image
import os
import numpy as np
import json

import torch
from torch.utils.data import Dataset

from pycocotools.coco import COCO

from hotr.data.datasets import builtin_meta
import hotr.data.transforms.transforms as T
class VCocoDetection(Dataset):
    def __init__(self,
                 img_folder,
                 ann_file,
                 all_file,
                 filter_empty_gt=True,
                 transforms=None):
        self.img_folder = img_folder
        self.file_meta = dict()
        self._transforms = transforms

        self.ann_file = ann_file
        self.all_file = all_file
        self.filter_empty_gt = filter_empty_gt

        # COCO initialize
        self.coco = COCO(self.all_file)
        self.COCO_CLASSES = builtin_meta._get_coco_instances_meta()['coco_classes']
        self.file_meta['coco_classes'] = self.COCO_CLASSES

        # Load V-COCO Dataset
        self.vcoco_all = self.load_vcoco(self.ann_file)

        # Save COCO annotation data
        self.image_ids = sorted(list(set(self.vcoco_all[0]['image_id'].reshape(-1))))

        # Filter Data
        if filter_empty_gt:
            self.filter_image_id()
        self.img_infos = self.load_annotations()

        # Refine Data
        self.save_action_name()
        self.mapping_inst_action_to_action()
        self.load_subobj_classes()

        self.CLASSES = self.act_list
    ############################################################################
    # Load V-COCO Dataset
    ############################################################################
    def load_vcoco(self, dir_name=None):
        with open(dir_name, 'rt') as f:
            vsrl_data = json.load(f)

        for i in range(len(vsrl_data)):
            # (num_roles * N,) -> (N, num_roles): one row of role object ids per annotation
            vsrl_data[i]['role_object_id'] = \
                np.array(vsrl_data[i]['role_object_id']).reshape((len(vsrl_data[i]['role_name']), -1)).T
            for j in ['ann_id', 'label', 'image_id']:
                vsrl_data[i][j] = np.array(vsrl_data[i][j]).reshape((-1, 1))

        return vsrl_data
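    # A minimal sketch of what each `vsrl_data` entry is expected to hold after
    # `load_vcoco` (field names follow the V-COCO vsrl format; shapes assume N
    # annotations for the action — the concrete values here are illustrative):
    #   {
    #     'action_name':    'cut',                        # str
    #     'role_name':      ['agent', 'instr', 'obj'],    # 1 to 3 roles
    #     'ann_id':         np.ndarray of shape (N, 1),   # COCO annotation ids
    #     'image_id':       np.ndarray of shape (N, 1),
    #     'label':          np.ndarray of shape (N, 1),   # 1 if the action is active
    #     'role_object_id': np.ndarray of shape (N, len(role_name)),
    #   }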
    ############################################################################
    # Refine Data
    ############################################################################
    def save_action_name(self):
        self.inst_act_list = list()
        self.act_list = list()

        # add instance action human classes
        self.num_subject_act = 0
        for vcoco in self.vcoco_all:
            self.inst_act_list.append('human_' + vcoco['action_name'])
            self.num_subject_act += 1

        # add instance action object classes
        for vcoco in self.vcoco_all:
            if len(vcoco['role_name']) == 3:
                self.inst_act_list.append('object_' + vcoco['action_name'] + '_' + vcoco['role_name'][1])
                self.inst_act_list.append('object_' + vcoco['action_name'] + '_' + vcoco['role_name'][2])
            elif len(vcoco['role_name']) < 2:
                continue
            else: # when only two roles
                self.inst_act_list.append('object_' + vcoco['action_name'] + '_' + vcoco['role_name'][-1])

        # add action classes
        for vcoco in self.vcoco_all:
            if len(vcoco['role_name']) == 3:
                self.act_list.append(vcoco['action_name'] + '_' + vcoco['role_name'][1])
                self.act_list.append(vcoco['action_name'] + '_' + vcoco['role_name'][2])
            else:
                self.act_list.append(vcoco['action_name'] + '_' + vcoco['role_name'][-1])

        # add to meta
        self.file_meta['action_classes'] = self.act_list
    def mapping_inst_action_to_action(self):
        sub_idx = 0
        obj_idx = self.num_subject_act

        self.sub_label_to_action = list()
        self.obj_label_to_action = list()

        for vcoco in self.vcoco_all:
            role_name = vcoco['role_name']
            self.sub_label_to_action.append(sub_idx)
            if len(role_name) == 3:
                self.sub_label_to_action.append(sub_idx)
                self.obj_label_to_action.append(obj_idx)
                self.obj_label_to_action.append(obj_idx + 1)
                obj_idx += 2
            elif len(role_name) == 2:
                self.obj_label_to_action.append(obj_idx)
                obj_idx += 1
            else:
                self.obj_label_to_action.append(0)
            sub_idx += 1
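    # Illustrative example (hypothetical indices, assuming 'cut' with roles
    # ['agent', 'instr', 'obj'] is the first action and there are 26 subject
    # instance actions, as the comments in load_instance_annotations suggest):
    # 'cut' contributes two action classes (cut_instr, cut_obj), so
    # sub_label_to_action starts with [0, 0] (both map back to the 'cut' agent)
    # and obj_label_to_action starts with [26, 27] (the two object
    # instance-action slots that follow the subject slots).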
    def load_subobj_classes(self):
        self.vcoco_labels = dict()
        for img in self.image_ids:
            self.vcoco_labels[img] = dict()
            self.vcoco_labels[img]['boxes'] = np.empty((0, 4), dtype=np.float32)
            self.vcoco_labels[img]['categories'] = np.empty((0), dtype=np.int32)

            ann_ids = self.coco.getAnnIds(imgIds=img, iscrowd=None)
            objs = self.coco.loadAnns(ann_ids)

            valid_ann_ids = []
            for i, obj in enumerate(objs):
                if 'ignore' in obj and obj['ignore'] == 1:
                    continue

                # convert [x, y, w, h] to [x1, y1, x2, y2], clipping degenerate boxes
                x1 = obj['bbox'][0]
                y1 = obj['bbox'][1]
                x2 = x1 + np.maximum(0., obj['bbox'][2] - 1.)
                y2 = y1 + np.maximum(0., obj['bbox'][3] - 1.)

                if obj['area'] > 0 and x2 > x1 and y2 > y1:
                    bbox = np.array([x1, y1, x2, y2]).reshape(1, -1)
                    cls = obj['category_id']
                    self.vcoco_labels[img]['boxes'] = np.concatenate([self.vcoco_labels[img]['boxes'], bbox], axis=0)
                    self.vcoco_labels[img]['categories'] = np.concatenate([self.vcoco_labels[img]['categories'], [cls]], axis=0)
                    valid_ann_ids.append(ann_ids[i])

            num_valid_objs = len(valid_ann_ids)

            self.vcoco_labels[img]['agent_actions'] = -np.ones((num_valid_objs, self.num_action()), dtype=np.int32)
            self.vcoco_labels[img]['obj_actions'] = np.zeros((num_valid_objs, self.num_action()), dtype=np.int32)
            self.vcoco_labels[img]['role_id'] = -np.ones((num_valid_objs, self.num_action()), dtype=np.int32)

            for ix, ann_id in enumerate(valid_ann_ids):
                in_vcoco = np.where(self.vcoco_all[0]['ann_id'] == ann_id)[0]
                if in_vcoco.size > 0:
                    self.vcoco_labels[img]['agent_actions'][ix, :] = 0

                    # walk all actions, keeping running offsets into the action columns
                    agent_act_id = 0
                    obj_act_id = -1
                    for i, x in enumerate(self.vcoco_all):
                        has_label = np.where(np.logical_and(x['ann_id'] == ann_id, x['label'] == 1))[0]
                        if has_label.size > 0:
                            assert has_label.size == 1
                            rids = x['role_object_id'][has_label]

                            if rids.shape[1] == 3:
                                self.vcoco_labels[img]['agent_actions'][ix, agent_act_id] = 1
                                self.vcoco_labels[img]['agent_actions'][ix, agent_act_id + 1] = 1
                                agent_act_id += 2
                            else:
                                self.vcoco_labels[img]['agent_actions'][ix, agent_act_id] = 1
                                agent_act_id += 1
                                if rids.shape[1] == 1:
                                    obj_act_id += 1

                            for j in range(1, rids.shape[1]):
                                obj_act_id += 1
                                if rids[0, j] == 0:
                                    continue # no role
                                # look up the valid-annotation index of the role object
                                # (np.asarray makes the elementwise comparison explicit)
                                aid = np.where(np.asarray(valid_ann_ids) == rids[0, j])[0]
                                self.vcoco_labels[img]['role_id'][ix, obj_act_id] = aid
                                self.vcoco_labels[img]['obj_actions'][aid, obj_act_id] = 1
                        else:
                            # this action is not annotated for ann_id; skip its columns
                            rids = x['role_object_id'][0]
                            if rids.shape[0] == 3:
                                agent_act_id += 2
                                obj_act_id += 2
                            else:
                                agent_act_id += 1
                                obj_act_id += 1
    ############################################################################
    # Annotation Loader
    ############################################################################
    # >>> 1. instance
    def load_instance_annotations(self, image_index):
        num_ann = self.vcoco_labels[image_index]['boxes'].shape[0]
        inst_action = np.zeros((num_ann, self.num_inst_action()), dtype=np.int64) # np.int was removed in NumPy >= 1.24
        inst_bbox = np.zeros((num_ann, 4), dtype=np.float32)
        inst_category = np.zeros((num_ann, ), dtype=np.int64)

        for idx in range(num_ann):
            inst_bbox[idx] = self.vcoco_labels[image_index]['boxes'][idx]
            inst_category[idx] = self.vcoco_labels[image_index]['categories'][idx] # + 1 # category 1 ~ 81

            if inst_category[idx] == 1: # category 1 = person
                act = self.vcoco_labels[image_index]['agent_actions'][idx]
                inst_action[idx, :self.num_subject_act] = act[np.unique(self.sub_label_to_action, return_index=True)[1]]

                # when person is the obj
                act = self.vcoco_labels[image_index]['obj_actions'][idx]
                if act.any():
                    inst_action[idx, self.num_subject_act:] = act[np.nonzero(self.obj_label_to_action)[0]]
                    if inst_action[idx, :self.num_subject_act].sum(axis=-1) < 0:
                        inst_action[idx, :self.num_subject_act] = 0
            else:
                act = self.vcoco_labels[image_index]['obj_actions'][idx]
                inst_action[idx, self.num_subject_act:] = act[np.nonzero(self.obj_label_to_action)[0]]

        # >>> For objects that are in COCO but not in V-COCO:
        # >>> Human  -> [-1 * 26, 0 * 25]
        # >>> Object -> [0 * 51]
        # >>> Don't return anything for actions with max 0 or max -1
        max_val = inst_action.max(axis=1)
        if (max_val > 0).sum() == 0:
            print(f"No Annotations for {image_index}")
            print(inst_action)
            print(self.vcoco_labels[image_index]['agent_actions'][idx])
            print(self.vcoco_labels[image_index]['obj_actions'][idx])

        return inst_bbox[max_val > 0], inst_category[max_val > 0], inst_action[max_val > 0]
    # >>> 2. pair
    def load_pair_annotations(self, image_index):
        num_ann = self.vcoco_labels[image_index]['boxes'].shape[0]
        pair_action = np.zeros((0, self.num_action()), dtype=np.int64)
        pair_bbox = np.zeros((0, 8), dtype=np.float32) # [human box (4), object box (4)]
        pair_target = np.zeros((0, ), dtype=np.int64)

        for idx in range(num_ann):
            h_box = self.vcoco_labels[image_index]['boxes'][idx]
            h_cat = self.vcoco_labels[image_index]['categories'][idx]
            if h_cat != 1:
                continue # human_id = 1

            h_act = self.vcoco_labels[image_index]['agent_actions'][idx]
            if np.any((h_act == -1)):
                continue

            # group this human's action ids by the object they act on
            o_act = dict()
            for aid in range(self.num_action()):
                if h_act[aid] == 0:
                    continue
                o_id = self.vcoco_labels[image_index]['role_id'][idx, aid]
                if o_id not in o_act:
                    o_act[o_id] = list()
                o_act[o_id].append(aid)

            for o_id in o_act.keys():
                if o_id == -1: # target is background
                    o_box = -np.ones((4, ))
                    o_cat = -1
                else:
                    o_box = self.vcoco_labels[image_index]['boxes'][o_id]
                    o_cat = self.vcoco_labels[image_index]['categories'][o_id] # category 0 ~ 80

                box = np.concatenate([h_box, o_box]).astype(np.float32)
                act = np.zeros((1, self.num_action()), dtype=np.int64)
                tar = np.zeros((1, ), dtype=np.int64)
                tar[0] = o_cat # + 1 # category 1 ~ 81
                for o_aid in o_act[o_id]:
                    act[0, o_aid] = 1

                pair_action = np.concatenate([pair_action, act], axis=0)
                pair_bbox = np.concatenate([pair_bbox, np.expand_dims(box, axis=0)], axis=0)
                pair_target = np.concatenate([pair_target, tar], axis=0)

        return pair_bbox, pair_action, pair_target
    # >>> 3. image infos
    def load_annotations(self):
        img_infos = []
        for i in self.image_ids:
            info = self.coco.loadImgs([i])[0]
            img_infos.append(info)
        return img_infos
    ############################################################################
    # Check Method
    ############################################################################
    def sum_action_ann_for_id(self, find_idx):
        # count, over all actions, how many have a positive label for this image
        count = 0
        for action_ann in self.vcoco_all:
            img_ids = action_ann['image_id']
            img_labels = action_ann['label']
            final_inds = img_ids[img_labels == 1]
            if find_idx in final_inds:
                count += 1
        # True if at least one action annotation exists for this image
        return count > 0

    def filter_image_id(self):
        empty_gt_list = []
        for img_id in self.image_ids:
            if not self.sum_action_ann_for_id(img_id):
                empty_gt_list.append(img_id)
        for remove_id in empty_gt_list:
            self.image_ids.remove(remove_id)
    ############################################################################
    # Preprocessing
    ############################################################################
    def prepare_img(self, idx):
        img_info = self.img_infos[idx]
        image = Image.open(os.path.join(self.img_folder, img_info['file_name'])).convert('RGB')
        target = self.get_ann_info(idx)

        w, h = image.size
        target["orig_size"] = torch.as_tensor([int(h), int(w)])
        target["size"] = torch.as_tensor([int(h), int(w)])

        img = image # fall back to the raw PIL image when no transforms are given
        if self._transforms is not None:
            img, target = self._transforms(image, target) # "size" gets converted here

        return img, target
    ############################################################################
    # Get Method
    ############################################################################
    def __getitem__(self, idx):
        img, target = self.prepare_img(idx)
        return img, target

    def __len__(self):
        return len(self.image_ids)

    def get_human_label_idx(self):
        return self.sub_label_to_action

    def get_object_label_idx(self):
        return self.obj_label_to_action

    def get_image_ids(self):
        return self.image_ids

    def get_categories(self):
        return self.COCO_CLASSES

    def get_inst_action(self):
        return self.inst_act_list

    def get_actions(self):
        return self.act_list

    def get_human_action(self):
        return self.inst_act_list[:self.num_subject_act]

    def get_object_action(self):
        return self.inst_act_list[self.num_subject_act:]

    def get_ann_info(self, idx):
        img_idx = int(self.image_ids[idx])

        # load each annotation
        inst_bbox, inst_label, inst_actions = self.load_instance_annotations(img_idx)
        pair_bbox, pair_actions, pair_targets = self.load_pair_annotations(img_idx)

        sample = {
            'image_id': torch.tensor([img_idx]),
            'boxes': torch.as_tensor(inst_bbox, dtype=torch.float32),
            'labels': torch.tensor(inst_label, dtype=torch.int64),
            'inst_actions': torch.tensor(inst_actions, dtype=torch.int64),
            'pair_boxes': torch.as_tensor(pair_bbox, dtype=torch.float32),
            'pair_actions': torch.tensor(pair_actions, dtype=torch.int64),
            'pair_targets': torch.tensor(pair_targets, dtype=torch.int64),
        }
        return sample

    ############################################################################
    # Number Method
    ############################################################################
    def num_category(self):
        return len(self.COCO_CLASSES)

    def num_action(self):
        return len(self.act_list)

    def num_inst_action(self):
        return len(self.inst_act_list)

    def num_human_act(self):
        return len(self.inst_act_list[:self.num_subject_act])

    def num_object_act(self):
        return len(self.inst_act_list[self.num_subject_act:])
def make_hoi_transforms(image_set):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.ColorJitter(.4, .4, .4),
            T.RandomSelect(
                T.RandomResize(scales, max_size=1333),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=1333),
                ])
            ),
            normalize,
        ])

    if image_set in ('val', 'test'):
        return T.Compose([
            T.RandomResize([800], max_size=1333),
            normalize,
        ])

    raise ValueError(f'unknown image_set: {image_set}')
def build(image_set, args):
    root = Path(args.data_path)
    assert root.exists(), f'provided V-COCO path {root} does not exist'

    PATHS = {
        "train": (root / "coco/images/train2014", root / "data/vcoco" / "vcoco_trainval.json"),
        "val":   (root / "coco/images/val2014",   root / "data/vcoco" / "vcoco_test.json"),
        "test":  (root / "coco/images/val2014",   root / "data/vcoco" / "vcoco_test.json"),
    }
    img_folder, ann_file = PATHS[image_set]
    all_file = root / "data/instances_vcoco_all_2014.json"

    dataset = VCocoDetection(
        img_folder=img_folder,
        ann_file=ann_file,
        all_file=all_file,
        filter_empty_gt=True,
        transforms=make_hoi_transforms(image_set),
    )
    dataset.file_meta['dataset_file'] = args.dataset_file
    dataset.file_meta['image_set'] = image_set

    return dataset
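
# Minimal usage sketch (not part of the original module): builds the validation
# split with this file's own build() and inspects one sample. The `data_path`
# value and the Namespace stand-in for the project's argparse args are
# hypothetical placeholders; point data_path at your own V-COCO root.
if __name__ == "__main__":
    from argparse import Namespace

    args = Namespace(data_path="v-coco", dataset_file="vcoco")  # hypothetical values
    dataset = build("val", args)
    print(f"{len(dataset)} images, {dataset.num_action()} action classes")

    img, target = dataset[0]  # applies the 'val' transforms from make_hoi_transforms
    print(img.shape, target["pair_boxes"].shape, target["pair_actions"].shape)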