# AiOS / detrsmpl / apis /
import cv2
import mmcv
import numpy as np
import torch
from mmcv.parallel import collate
from mmcv.runner import load_checkpoint

# NOTE(review): the module path of this import was lost in the source text;
# ``detrsmpl.data.datasets.pipelines`` is assumed by analogy with the sibling
# ``detrsmpl.*`` imports -- confirm against the package layout.
from detrsmpl.data.datasets.pipelines import Compose
from detrsmpl.models.architectures.builder import build_architecture
from detrsmpl.models.backbones.builder import build_backbone
from detrsmpl.utils.demo_utils import box2cs, xywh2xyxy, xyxy2xywh
def init_model(config, checkpoint=None, device='cuda:0'):
    """Initialize a model (and, if configured, a feature extractor).

    Args:
        config (str or :obj:`mmcv.Config`): Config file path or the config
            object.
        checkpoint (str, optional): Checkpoint path. If left as None, the
            model will not load any weights.
        device (str): Device used as ``map_location`` when loading
            checkpoints and as the target the modules are moved to.
            Defaults to 'cuda:0'.

    Returns:
        tuple: ``(model, extractor)`` where
            model (nn.Module): The constructed model.
            extractor (nn.Module, None): The constructed extractor model.
                It is only built when ``config.model.type`` is
                'VideoBodyModelEstimator'; otherwise None.

    Raises:
        TypeError: If ``config`` is neither a str nor an ``mmcv.Config``.
    """
    if isinstance(config, str):
        config = mmcv.Config.fromfile(config)
    elif not isinstance(config, mmcv.Config):
        raise TypeError('config must be a filename or Config object, '
                        f'but got {type(config)}')
    # NOTE(review): the left-hand side of this assignment was lost in the
    # source text (only "= True" survived); the test-mode flag of the test
    # dataset config is assumed -- confirm.
    config.data.test.test_mode = True

    model = build_architecture(config.model)
    if checkpoint is not None:
        # load model checkpoint
        load_checkpoint(model, checkpoint, map_location=device)
    # save the config in the model for convenience
    model.cfg = config
    # NOTE(review): device placement / eval mode were reconstructed; the
    # inference helpers in this file read the device from the model
    # parameters, so the model has to be moved somewhere -- confirm.
    model.to(device)
    model.eval()

    extractor = None
    if config.model.type == 'VideoBodyModelEstimator':
        extractor = build_backbone(config.extractor.backbone)
        if config.extractor.checkpoint is not None:
            # load extractor checkpoint; map_location mirrors the model
            # checkpoint above so weights saved on another device still load
            load_checkpoint(extractor,
                            config.extractor.checkpoint,
                            map_location=device)
        extractor.cfg = config
        extractor.to(device)
        extractor.eval()
    return model, extractor
class LoadImage:
    """A simple pipeline step that loads an image into a results dict."""

    def __init__(self, color_type='color', channel_order='bgr'):
        # Both values are forwarded to ``mmcv.imread`` when reading from disk.
        self.color_type = color_type
        self.channel_order = channel_order

    def __call__(self, results):
        """Call function to load images into results.

        Args:
            results (dict): A result dict contains the image_path. The value
                of ``'image_path'`` is either a filename (str) or an already
                loaded image (np.ndarray).

        Returns:
            dict: ``results`` will be returned containing loaded image under
                ``'img'`` and the filename ('' for arrays) under
                ``'image_file'``.

        Raises:
            TypeError: If ``results['image_path']`` is neither a str nor a
                numpy array.
        """
        image = results['image_path']
        if isinstance(image, str):
            results['image_file'] = image
            img = mmcv.imread(image, self.color_type, self.channel_order)
        elif isinstance(image, np.ndarray):
            results['image_file'] = ''
            if self.color_type == 'color' and self.channel_order == 'rgb':
                # in-memory arrays are treated as BGR and converted on demand
                img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            else:
                img = image
        else:
            # NOTE(review): the message mentions pathlib.Path, but only str
            # and np.ndarray are accepted by the checks above.
            raise TypeError('"image_path" must be a numpy array or a str or '
                            'a pathlib.Path object')

        results['img'] = img
        return results
def inference_image_based_model(
    model,
    img_or_path,
    det_results,
    bbox_thr=None,
    format='xywh',
):
    """Inference a single image with a list of person bounding boxes.

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (Union[str, np.ndarray]): Image filename or loaded image.
        det_results (List(dict)): the item in the dict may contain
            'bbox' and/or 'track_id'.
            'bbox' (4, ) or (5, ): The person bounding box, which contains
            4 box coordinates (and score).
            'track_id' (int): The unique id for each human instance.
        bbox_thr (float, optional): Threshold for bounding boxes.
            Only bboxes with higher scores will be fed into the pose detector.
            If bbox_thr is None, ignore it. Defaults to None.
        format (str, optional): bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).

    Returns:
        list[dict]: Each item in the list is a dictionary,
            containing the bbox: (left, top, right, bottom, [score]),
            SMPL parameters, vertices, kp3d, and camera.
            An empty list is returned when there is no detection or when
            every detection is rejected by ``bbox_thr``.
    """
    # only two kinds of bbox format is supported.
    assert format in ['xyxy', 'xywh']
    mesh_results = []
    if len(det_results) == 0:
        return []

    # Change for-loop preprocess each bbox to preprocess all bboxes at once.
    bboxes = np.array([box['bbox'] for box in det_results])

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        det_results = [det_results[i] for i in valid_idx]

    # if bbox_thr remove all bounding box
    # (checked before the format conversion, as feature_extract does)
    if len(bboxes) == 0:
        return []

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = xywh2xyxy(bboxes)

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    inference_pipeline = [LoadImage()] + cfg.inference_pipeline
    inference_pipeline = Compose(inference_pipeline)

    assert len(bboxes[0]) in [4, 5]

    batch_data = []
    input_size = cfg['img_res']
    aspect_ratio = 1 if isinstance(input_size,
                                   int) else input_size[0] / input_size[1]

    for i, bbox in enumerate(bboxes_xywh):
        center, scale = box2cs(bbox, aspect_ratio, bbox_scale_factor=1.25)
        # prepare data
        data = {
            'image_path': img_or_path,
            'center': center,
            'scale': scale,
            'rotation': 0,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            'sample_idx': i,
        }
        data = inference_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter not work so just move image to cuda device
        batch_data['img'] = batch_data['img'].to(device)

    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]

    # forward the model
    # NOTE(review): the keyword arguments of this call were lost in the
    # source text; they are reconstructed from the batch keys prepared above
    # and the parallel call in inference_video_based_model -- confirm.
    with torch.no_grad():
        results = model(
            img=batch_data['img'],
            img_metas=batch_data['img_metas'],
            sample_idx=batch_data['sample_idx'],
        )

    for idx, det_result in enumerate(det_results):
        # keep the detection's own fields (e.g. 'track_id') and attach the
        # per-instance model outputs
        mesh_result = det_result.copy()
        mesh_result['bbox'] = bboxes_xyxy[idx]
        mesh_result['camera'] = results['camera'][idx]
        mesh_result['smpl_pose'] = results['smpl_pose'][idx]
        mesh_result['smpl_beta'] = results['smpl_beta'][idx]
        mesh_result['vertices'] = results['vertices'][idx]
        mesh_result['keypoints_3d'] = results['keypoints_3d'][idx]
        mesh_results.append(mesh_result)
    return mesh_results
# NOTE(review): every parameter after ``model`` was lost in the source text.
# Names and order follow the docstring and the body; ``with_track_id=True``
# is documented, ``causal=True`` is assumed -- confirm.
def inference_video_based_model(model,
                                extracted_results,
                                with_track_id=True,
                                causal=True):
    """Inference SMPL parameters from extracted features using a video-based
    model.

    Args:
        model (nn.Module): The loaded mesh estimation model.
        extracted_results (List[List[Dict]]): Multi-frame feature extraction
            results stored in a nested list. Each element of the outer list
            is the feature extraction results of a single frame, and each
            element of the inner list is the feature information of one person,
            which contains:
                features (ndarray): extracted features
                track_id (int): unique id of each person, required when
                    ``with_track_id`` is True
                bbox ((4, ) or (5, )): person bounding box, optionally with
                    a score; copied unchanged into the output
        with_track_id: If True, the element in extracted_results is expected to
            contain "track_id", which will be used to gather the feature
            sequence of a person from multiple frames. Otherwise, the extracted
            results in each frame are expected to have a consistent number and
            order of identities. Default is True.
        causal (bool): If True, the target frame is the first frame in
            a sequence. Otherwise, the target frame is in the middle of a
            sequence.

    Returns:
        list[dict]: Each item in the list is a dictionary, which contains:
            SMPL parameters, vertices, kp3d, and camera.
    """
    cfg = model.cfg
    device = next(model.parameters()).device
    # NOTE(review): the right-hand side of this assignment was lost in the
    # source text; the sequence length of the test dataset config is
    # assumed -- confirm.
    seq_len = cfg.data.test.seq_len
    mesh_results = []
    # build the data pipeline
    inference_pipeline = Compose(cfg.inference_pipeline)
    target_idx = 0 if causal else len(extracted_results) // 2

    input_features = _gather_input_features(extracted_results)
    feature_sequences = _collate_feature_sequence(input_features,
                                                  with_track_id, target_idx)
    if not feature_sequences:
        return mesh_results

    batch_data = []
    for i, seq in enumerate(feature_sequences):
        data = {
            'features': seq['features'],
            'sample_idx': i,
        }
        data = inference_pipeline(data)
        batch_data.append(data)

    # all sequences are collated into a single batch
    batch_data = collate(batch_data, samples_per_gpu=len(batch_data))

    if next(model.parameters()).is_cuda:
        # scatter not work so just move image to cuda device
        batch_data['features'] = batch_data['features'].to(device)

    # NOTE(review): the second keyword argument of this call was lost in the
    # source text; ``sample_idx`` is assumed because it is the only other key
    # prepared above -- confirm.
    with torch.no_grad():
        results = model(features=batch_data['features'],
                        sample_idx=batch_data['sample_idx'])

    # restore the (sequence, frame) layout so the target frame can be indexed
    results['camera'] = results['camera'].reshape(-1, seq_len, 3)
    results['smpl_pose'] = results['smpl_pose'].reshape(-1, seq_len, 24, 3, 3)
    results['smpl_beta'] = results['smpl_beta'].reshape(-1, seq_len, 10)
    results['vertices'] = results['vertices'].reshape(-1, seq_len, 6890, 3)
    results['keypoints_3d'] = results['keypoints_3d'].reshape(
        -1, seq_len, 17, 3)

    for idx in range(len(feature_sequences)):
        target_person = extracted_results[target_idx][idx]
        mesh_result = dict()
        mesh_result['camera'] = results['camera'][idx, target_idx]
        mesh_result['smpl_pose'] = results['smpl_pose'][idx, target_idx]
        mesh_result['smpl_beta'] = results['smpl_beta'][idx, target_idx]
        mesh_result['vertices'] = results['vertices'][idx, target_idx]
        mesh_result['keypoints_3d'] = results['keypoints_3d'][idx, target_idx]
        mesh_result['bbox'] = target_person['bbox']
        # 'track_id' is not included in results generated by mmdet
        if 'track_id' in target_person.keys():
            mesh_result['track_id'] = target_person['track_id']
        mesh_results.append(mesh_result)
    return mesh_results
def feature_extract(
    model,
    img_or_path,
    det_results,
    bbox_thr=None,
    format='xywh',
):
    """Extract image features with a list of person bounding boxes.

    Args:
        model (nn.Module): The loaded feature extraction model.
        img_or_path (Union[str, np.ndarray]): Image filename or loaded image.
        det_results (List(dict)): the item in the dict may contain
            'bbox' and/or 'track_id'.
            'bbox' (4, ) or (5, ): The person bounding box, which contains
            4 box coordinates (and score).
            'track_id' (int): The unique id for each human instance.
        bbox_thr (float, optional): Threshold for bounding boxes.
            If bbox_thr is None, ignore it. Defaults to None.
        format (str, optional): bbox format. Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).

    Returns:
        list[dict]: The bbox & pose info,
            containing the bbox: (left, top, right, bottom, [score])
            and the features.
            An empty list is returned when there is no detection or when
            every detection is rejected by ``bbox_thr``.
    """
    # only two kinds of bbox format is supported.
    assert format in ['xyxy', 'xywh']

    feature_results = []
    # nothing to do without detections; checked before touching the model,
    # as inference_image_based_model does
    if len(det_results) == 0:
        return feature_results

    # Change for-loop preprocess each bbox to preprocess all bboxes at once.
    bboxes = np.array([box['bbox'] for box in det_results])
    assert len(bboxes[0]) in [4, 5]

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        det_results = [det_results[i] for i in valid_idx]

    # if bbox_thr remove all bounding box
    if len(bboxes) == 0:
        return feature_results

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = xywh2xyxy(bboxes)

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    extractor_pipeline = [LoadImage()] + cfg.extractor_pipeline
    extractor_pipeline = Compose(extractor_pipeline)

    batch_data = []
    input_size = cfg['img_res']
    aspect_ratio = 1 if isinstance(input_size,
                                   int) else input_size[0] / input_size[1]

    for i, bbox in enumerate(bboxes_xywh):
        center, scale = box2cs(bbox, aspect_ratio, bbox_scale_factor=1.25)
        # prepare data
        data = {
            'image_path': img_or_path,
            'center': center,
            'scale': scale,
            'rotation': 0,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            'sample_idx': i,
        }
        data = extractor_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter not work so just move image to cuda device
        batch_data['img'] = batch_data['img'].to(device)

    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]

    # forward the model
    with torch.no_grad():
        results = model(batch_data['img'])

        if isinstance(results, (list, tuple)):
            # use the last output and average over its last two dimensions
            # (presumably the spatial H, W axes -- TODO confirm)
            results = results[-1].mean(dim=-1).mean(dim=-1)

    for idx, det_result in enumerate(det_results):
        # keep the detection's own fields and attach bbox (xyxy) + features
        feature_result = det_result.copy()
        feature_result['bbox'] = bboxes_xyxy[idx]
        feature_result['features'] = results[idx].cpu().numpy()
        feature_results.append(feature_result)

    return feature_results
def _gather_input_features(extracted_results):
    """Gather input features.

    Only the 'features' and 'track_id' entries of every person are kept;
    all other keys (e.g. 'bbox') are dropped.

    Args:
        extracted_results (List[List[Dict]]):
            Multi-frame feature extraction results

    Returns:
        List[List[dict]]: Multi-frame feature extraction results
            stored in a nested list. Each element of the outer list is the
            feature extraction results of a single frame, and each element of
            the inner list is the extracted results of one person,
            which contains:
                features (ndarray): extracted features (if present)
                track_id (int): unique id of each person (if present)
    """
    sequence_inputs = []
    for frame in extracted_results:
        frame_inputs = []
        for res in frame:
            inputs = dict()
            if 'features' in res:
                inputs['features'] = res['features']
            if 'track_id' in res:
                inputs['track_id'] = res['track_id']
            frame_inputs.append(inputs)
        sequence_inputs.append(frame_inputs)
    return sequence_inputs
def _collate_feature_sequence(extracted_features,
                              with_track_id=True,
                              target_frame=0):
    """Reorganize multi-frame feature extraction results into individual
    feature sequences.

    Args:
        extracted_features (List[List[Dict]]): Multi-frame feature extraction
            results stored in a nested list. Each element of the outer list
            is the feature extraction results of a single frame, and each
            element of the inner list is the extracted results of one person,
            which contains:
                features (ndarray): extracted features
                track_id (int): unique id of each person, required when
                    ``with_track_id`` is True
        with_track_id (bool): If True, the element in extracted_features is
            expected to contain "track_id", which will be used to gather the
            feature sequence of a person from multiple frames. Otherwise, the
            results in each frame are expected to have a consistent number and
            order of identities. Default is True.
        target_frame (int): The index of the target frame. Negative values
            index from the end. Default: 0.

    Returns:
        List[dict]: One dict per person found in the target frame. Each dict
            holds that person's non-feature entries from the target frame
            plus 'features', an array with one row per frame. An empty list
            is returned when the target frame contains no person.
    """
    T = len(extracted_features)
    assert T > 0

    target_frame = (T + target_frame) % T  # convert negative index to positive

    N = len(
        extracted_features[target_frame])  # use identities in the target frame
    if N == 0:
        return []

    # feature dimension, read from the first person in the target frame
    C = extracted_features[target_frame][0]['features'].shape[0]

    track_ids = None
    if with_track_id:
        track_ids = [
            res['track_id'] for res in extracted_features[target_frame]
        ]

    feature_sequences = []
    for idx in range(N):
        feature_seq = dict()
        # gather static information
        for k, v in extracted_features[target_frame][idx].items():
            if k != 'features':
                feature_seq[k] = v
        # gather features
        if not with_track_id:
            # identities are aligned by position across frames
            feature_seq['features'] = np.stack(
                [frame[idx]['features'] for frame in extracted_features])
        else:
            features = np.zeros((T, C), dtype=np.float32)
            features[target_frame] = extracted_features[target_frame][idx][
                'features']
            # walk left from the target frame; a frame that does not contain
            # track_ids[idx] reuses the features of the frame to its right
            for frame_idx in range(target_frame - 1, -1, -1):
                contains_idx = False
                for res in extracted_features[frame_idx]:
                    if res['track_id'] == track_ids[idx]:
                        features[frame_idx] = res['features']
                        contains_idx = True
                        # NOTE(review): stopping at the first match is
                        # reconstructed; confirm against the original.
                        break
                if not contains_idx:
                    # replicate the left most frame
                    features[frame_idx] = features[frame_idx + 1]

            # walk right from the target frame; a frame that does not contain
            # track_ids[idx] reuses the features of the frame to its left
            for frame_idx in range(target_frame + 1, T):
                contains_idx = False
                for res in extracted_features[frame_idx]:
                    if res['track_id'] == track_ids[idx]:
                        features[frame_idx] = res['features']
                        contains_idx = True
                        break
                if not contains_idx:
                    # replicate the right most frame
                    features[frame_idx] = features[frame_idx - 1]
            feature_seq['features'] = features
        feature_sequences.append(feature_seq)

    return feature_sequences