# Copyright (c) OpenMMLab. All rights reserved.
import sys
import warnings
from unittest.mock import MagicMock
import pytest
import torch
import torch.nn as nn
from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor
from mmcv.runner.optimizer import build_optimizer, build_optimizer_constructor
from mmcv.runner.optimizer.builder import TORCH_OPTIMIZERS
from mmcv.utils.ext_loader import check_ops_exist
OPS_AVAILABLE = check_ops_exist()
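# If the compiled extensions are missing, substitute a mock ``mmcv.ops`` module
# so that code importing DeformConv2d / ModulatedDeformConv2d still runs; the
# DCN-specific branches of the tests are then skipped via OPS_AVAILABLE.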
if not OPS_AVAILABLE:
sys.modules['mmcv.ops'] = MagicMock(
DeformConv2d=dict, ModulatedDeformConv2d=dict)
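# Small submodule with a depthwise conv (groups == in_channels), a GroupNorm
# layer and a bare parameter, used to exercise dwconv_decay_mult and
# norm_decay_mult below.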
class SubModel(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(2, 2, kernel_size=1, groups=2)
self.gn = nn.GroupNorm(2, 2)
self.param1 = nn.Parameter(torch.ones(1))
def forward(self, x):
return x
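# Model covering the cases DefaultOptimizerConstructor distinguishes: a bare
# parameter, convs with and without bias, BatchNorm, a nested submodule and,
# when the compiled ops are available, a deformable conv with an offset branch.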
class ExampleModel(nn.Module):
def __init__(self):
super().__init__()
self.param1 = nn.Parameter(torch.ones(1))
self.conv1 = nn.Conv2d(3, 4, kernel_size=1, bias=False)
self.conv2 = nn.Conv2d(4, 2, kernel_size=1)
self.bn = nn.BatchNorm2d(2)
self.sub = SubModel()
if OPS_AVAILABLE:
from mmcv.ops import DeformConv2dPack
self.dcn = DeformConv2dPack(
3, 4, kernel_size=3, deformable_groups=1)
def forward(self, x):
return x
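# Same layout as ExampleModel, but conv3 re-uses the module held by conv1, so
# its parameters are duplicates; used to test the bypass_duplicate option.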
class ExampleDuplicateModel(nn.Module):
def __init__(self):
super().__init__()
self.param1 = nn.Parameter(torch.ones(1))
self.conv1 = nn.Sequential(nn.Conv2d(3, 4, kernel_size=1, bias=False))
self.conv2 = nn.Sequential(nn.Conv2d(4, 2, kernel_size=1))
self.bn = nn.BatchNorm2d(2)
self.sub = SubModel()
self.conv3 = nn.Sequential(nn.Conv2d(3, 4, kernel_size=1, bias=False))
self.conv3[0] = self.conv1[0]
if OPS_AVAILABLE:
from mmcv.ops import DeformConv2dPack
self.dcn = DeformConv2dPack(
3, 4, kernel_size=3, deformable_groups=1)
def forward(self, x):
return x
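# Mimics the attribute layout of DataParallel (a single ``module`` attribute)
# without requiring a GPU.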
class PseudoDataParallel(nn.Module):
def __init__(self):
super().__init__()
self.module = ExampleModel()
def forward(self, x):
return x
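# Base hyper-parameters shared by all optimizer checks below.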
base_lr = 0.01
base_wd = 0.0001
momentum = 0.9
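# Check an optimizer built without paramwise_cfg: a single param group holding
# every model parameter with the base lr/momentum/weight_decay.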
def check_default_optimizer(optimizer, model, prefix=''):
assert isinstance(optimizer, torch.optim.SGD)
assert optimizer.defaults['lr'] == base_lr
assert optimizer.defaults['momentum'] == momentum
assert optimizer.defaults['weight_decay'] == base_wd
param_groups = optimizer.param_groups[0]
if OPS_AVAILABLE:
param_names = [
'param1', 'conv1.weight', 'conv2.weight', 'conv2.bias',
'bn.weight', 'bn.bias', 'sub.param1', 'sub.conv1.weight',
'sub.conv1.bias', 'sub.gn.weight', 'sub.gn.bias', 'dcn.weight',
'dcn.conv_offset.weight', 'dcn.conv_offset.bias'
]
else:
param_names = [
'param1', 'conv1.weight', 'conv2.weight', 'conv2.bias',
'bn.weight', 'bn.bias', 'sub.param1', 'sub.conv1.weight',
'sub.conv1.bias', 'sub.gn.weight', 'sub.gn.bias'
]
param_dict = dict(model.named_parameters())
assert len(param_groups['params']) == len(param_names)
for i in range(len(param_groups['params'])):
assert torch.equal(param_groups['params'][i],
param_dict[prefix + param_names[i]])
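# Check an SGD optimizer built with paramwise_cfg: each parameter gets its own
# group, and the bias/norm/dwconv/dcn multipliers are applied where expected.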
def check_sgd_optimizer(optimizer,
model,
prefix='',
bias_lr_mult=1,
bias_decay_mult=1,
norm_decay_mult=1,
dwconv_decay_mult=1,
dcn_offset_lr_mult=1,
bypass_duplicate=False):
param_groups = optimizer.param_groups
assert isinstance(optimizer, torch.optim.SGD)
assert optimizer.defaults['lr'] == base_lr
assert optimizer.defaults['momentum'] == momentum
assert optimizer.defaults['weight_decay'] == base_wd
model_parameters = list(model.parameters())
assert len(param_groups) == len(model_parameters)
for i, param in enumerate(model_parameters):
param_group = param_groups[i]
assert torch.equal(param_group['params'][0], param)
assert param_group['momentum'] == momentum
# param1
param1 = param_groups[0]
assert param1['lr'] == base_lr
assert param1['weight_decay'] == base_wd
# conv1.weight
conv1_weight = param_groups[1]
assert conv1_weight['lr'] == base_lr
assert conv1_weight['weight_decay'] == base_wd
# conv2.weight
conv2_weight = param_groups[2]
assert conv2_weight['lr'] == base_lr
assert conv2_weight['weight_decay'] == base_wd
# conv2.bias
conv2_bias = param_groups[3]
assert conv2_bias['lr'] == base_lr * bias_lr_mult
assert conv2_bias['weight_decay'] == base_wd * bias_decay_mult
# bn.weight
bn_weight = param_groups[4]
assert bn_weight['lr'] == base_lr
assert bn_weight['weight_decay'] == base_wd * norm_decay_mult
# bn.bias
bn_bias = param_groups[5]
assert bn_bias['lr'] == base_lr
assert bn_bias['weight_decay'] == base_wd * norm_decay_mult
# sub.param1
sub_param1 = param_groups[6]
assert sub_param1['lr'] == base_lr
assert sub_param1['weight_decay'] == base_wd
# sub.conv1.weight
sub_conv1_weight = param_groups[7]
assert sub_conv1_weight['lr'] == base_lr
assert sub_conv1_weight['weight_decay'] == base_wd * dwconv_decay_mult
# sub.conv1.bias
sub_conv1_bias = param_groups[8]
assert sub_conv1_bias['lr'] == base_lr * bias_lr_mult
assert sub_conv1_bias['weight_decay'] == base_wd * dwconv_decay_mult
# sub.gn.weight
sub_gn_weight = param_groups[9]
assert sub_gn_weight['lr'] == base_lr
assert sub_gn_weight['weight_decay'] == base_wd * norm_decay_mult
# sub.gn.bias
sub_gn_bias = param_groups[10]
assert sub_gn_bias['lr'] == base_lr
assert sub_gn_bias['weight_decay'] == base_wd * norm_decay_mult
    # the DCN parameters exist only when the compiled ops are available
    if OPS_AVAILABLE:
dcn_conv_weight = param_groups[11]
assert dcn_conv_weight['lr'] == base_lr
assert dcn_conv_weight['weight_decay'] == base_wd
dcn_offset_weight = param_groups[12]
assert dcn_offset_weight['lr'] == base_lr * dcn_offset_lr_mult
assert dcn_offset_weight['weight_decay'] == base_wd
dcn_offset_bias = param_groups[13]
assert dcn_offset_bias['lr'] == base_lr * dcn_offset_lr_mult
assert dcn_offset_bias['weight_decay'] == base_wd
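# Main constructor test: invalid configs must raise, and optimizers built from
# a range of configs/models are validated by the two helpers above.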
def test_default_optimizer_constructor():
model = ExampleModel()
with pytest.raises(TypeError):
# optimizer_cfg must be a dict
optimizer_cfg = []
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg)
optim_constructor(model)
with pytest.raises(TypeError):
# paramwise_cfg must be a dict or None
optimizer_cfg = dict(lr=0.0001)
paramwise_cfg = ['error']
optim_constructor = DefaultOptimizerConstructor(
optimizer_cfg, paramwise_cfg)
optim_constructor(model)
with pytest.raises(ValueError):
        # bias_decay_mult and norm_decay_mult are specified but weight_decay is None
optimizer_cfg = dict(lr=0.0001, weight_decay=None)
paramwise_cfg = dict(bias_decay_mult=1, norm_decay_mult=1)
optim_constructor = DefaultOptimizerConstructor(
optimizer_cfg, paramwise_cfg)
optim_constructor(model)
# basic config with ExampleModel
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg)
optimizer = optim_constructor(model)
check_default_optimizer(optimizer, model)
# basic config with pseudo data parallel
model = PseudoDataParallel()
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = None
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg)
optimizer = optim_constructor(model)
check_default_optimizer(optimizer, model, prefix='module.')
# basic config with DataParallel
if torch.cuda.is_available():
model = torch.nn.DataParallel(ExampleModel())
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = None
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg)
optimizer = optim_constructor(model)
check_default_optimizer(optimizer, model, prefix='module.')
# Empty paramwise_cfg with ExampleModel
model = ExampleModel()
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = dict()
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg,
paramwise_cfg)
optimizer = optim_constructor(model)
check_default_optimizer(optimizer, model)
# Empty paramwise_cfg with ExampleModel and no grad
model = ExampleModel()
for param in model.parameters():
param.requires_grad = False
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = dict()
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg)
optimizer = optim_constructor(model)
check_default_optimizer(optimizer, model)
# paramwise_cfg with ExampleModel
model = ExampleModel()
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = dict(
bias_lr_mult=2,
bias_decay_mult=0.5,
norm_decay_mult=0,
dwconv_decay_mult=0.1,
dcn_offset_lr_mult=0.1)
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg,
paramwise_cfg)
optimizer = optim_constructor(model)
check_sgd_optimizer(optimizer, model, **paramwise_cfg)
# paramwise_cfg with ExampleModel, weight decay is None
model = ExampleModel()
optimizer_cfg = dict(type='Rprop', lr=base_lr)
paramwise_cfg = dict(bias_lr_mult=2)
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg,
paramwise_cfg)
optimizer = optim_constructor(model)
param_groups = optimizer.param_groups
assert isinstance(optimizer, torch.optim.Rprop)
assert optimizer.defaults['lr'] == base_lr
model_parameters = list(model.parameters())
assert len(param_groups) == len(model_parameters)
for i, param in enumerate(model_parameters):
param_group = param_groups[i]
assert torch.equal(param_group['params'][0], param)
# param1
assert param_groups[0]['lr'] == base_lr
# conv1.weight
assert param_groups[1]['lr'] == base_lr
# conv2.weight
assert param_groups[2]['lr'] == base_lr
# conv2.bias
assert param_groups[3]['lr'] == base_lr * paramwise_cfg['bias_lr_mult']
# bn.weight
assert param_groups[4]['lr'] == base_lr
# bn.bias
assert param_groups[5]['lr'] == base_lr
# sub.param1
assert param_groups[6]['lr'] == base_lr
# sub.conv1.weight
assert param_groups[7]['lr'] == base_lr
# sub.conv1.bias
assert param_groups[8]['lr'] == base_lr * paramwise_cfg['bias_lr_mult']
# sub.gn.weight
assert param_groups[9]['lr'] == base_lr
# sub.gn.bias
assert param_groups[10]['lr'] == base_lr
if OPS_AVAILABLE:
# dcn.weight
assert param_groups[11]['lr'] == base_lr
# dcn.conv_offset.weight
assert param_groups[12]['lr'] == base_lr
# dcn.conv_offset.bias
assert param_groups[13]['lr'] == base_lr
# paramwise_cfg with pseudo data parallel
model = PseudoDataParallel()
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = dict(
bias_lr_mult=2,
bias_decay_mult=0.5,
norm_decay_mult=0,
dwconv_decay_mult=0.1,
dcn_offset_lr_mult=0.1)
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg,
paramwise_cfg)
optimizer = optim_constructor(model)
check_sgd_optimizer(optimizer, model, prefix='module.', **paramwise_cfg)
# paramwise_cfg with DataParallel
if torch.cuda.is_available():
model = torch.nn.DataParallel(ExampleModel())
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = dict(
bias_lr_mult=2,
bias_decay_mult=0.5,
norm_decay_mult=0,
dwconv_decay_mult=0.1,
dcn_offset_lr_mult=0.1)
optim_constructor = DefaultOptimizerConstructor(
optimizer_cfg, paramwise_cfg)
optimizer = optim_constructor(model)
check_sgd_optimizer(
optimizer, model, prefix='module.', **paramwise_cfg)
# paramwise_cfg with ExampleModel and no grad
for param in model.parameters():
param.requires_grad = False
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg,
paramwise_cfg)
optimizer = optim_constructor(model)
param_groups = optimizer.param_groups
assert isinstance(optimizer, torch.optim.SGD)
assert optimizer.defaults['lr'] == base_lr
assert optimizer.defaults['momentum'] == momentum
assert optimizer.defaults['weight_decay'] == base_wd
for i, (name, param) in enumerate(model.named_parameters()):
param_group = param_groups[i]
assert torch.equal(param_group['params'][0], param)
assert param_group['momentum'] == momentum
assert param_group['lr'] == base_lr
assert param_group['weight_decay'] == base_wd
# paramwise_cfg with bypass_duplicate option
model = ExampleDuplicateModel()
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = dict(
bias_lr_mult=2,
bias_decay_mult=0.5,
norm_decay_mult=0,
dwconv_decay_mult=0.1)
with pytest.raises(ValueError) as excinfo:
optim_constructor = DefaultOptimizerConstructor(
optimizer_cfg, paramwise_cfg)
optim_constructor(model)
    assert 'some parameters appear in more than one parameter ' \
        'group' in str(excinfo.value)
paramwise_cfg = dict(
bias_lr_mult=2,
bias_decay_mult=0.5,
norm_decay_mult=0,
dwconv_decay_mult=0.1,
dcn_offset_lr_mult=0.1,
bypass_duplicate=True)
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg,
paramwise_cfg)
    with warnings.catch_warnings(record=True) as w:
        # record every warning so the duplicate-parameter message is captured
        warnings.simplefilter('always')
        optimizer = optim_constructor(model)
assert len(w) == 1
assert str(w[0].message) == 'conv3.0 is duplicate. It is skipped ' \
'since bypass_duplicate=True'
model_parameters = list(model.parameters())
num_params = 14 if OPS_AVAILABLE else 11
assert len(optimizer.param_groups) == len(model_parameters) == num_params
check_sgd_optimizer(optimizer, model, **paramwise_cfg)
# test DefaultOptimizerConstructor with custom_keys and ExampleModel
model = ExampleModel()
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = dict(
custom_keys={
'param1': dict(lr_mult=10),
'sub': dict(lr_mult=0.1, decay_mult=0),
'sub.gn': dict(lr_mult=0.01),
'non_exist_key': dict(lr_mult=0.0)
},
norm_decay_mult=0.5)
with pytest.raises(TypeError):
# custom_keys should be a dict
paramwise_cfg_ = dict(custom_keys=[0.1, 0.0001])
optim_constructor = DefaultOptimizerConstructor(
optimizer_cfg, paramwise_cfg_)
optimizer = optim_constructor(model)
with pytest.raises(ValueError):
# if 'decay_mult' is specified in custom_keys, weight_decay should be
# specified
optimizer_cfg_ = dict(type='SGD', lr=0.01)
paramwise_cfg_ = dict(custom_keys={'.backbone': dict(decay_mult=0.5)})
optim_constructor = DefaultOptimizerConstructor(
optimizer_cfg_, paramwise_cfg_)
optimizer = optim_constructor(model)
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg,
paramwise_cfg)
optimizer = optim_constructor(model)
# check optimizer type and default config
assert isinstance(optimizer, torch.optim.SGD)
assert optimizer.defaults['lr'] == base_lr
assert optimizer.defaults['momentum'] == momentum
assert optimizer.defaults['weight_decay'] == base_wd
# check params groups
param_groups = optimizer.param_groups
groups = []
group_settings = []
# group 1, matches of 'param1'
# 'param1' is the longest match for 'sub.param1'
groups.append(['param1', 'sub.param1'])
group_settings.append({
'lr': base_lr * 10,
'momentum': momentum,
'weight_decay': base_wd,
})
# group 2, matches of 'sub.gn'
groups.append(['sub.gn.weight', 'sub.gn.bias'])
group_settings.append({
'lr': base_lr * 0.01,
'momentum': momentum,
'weight_decay': base_wd,
})
# group 3, matches of 'sub'
groups.append(['sub.conv1.weight', 'sub.conv1.bias'])
group_settings.append({
'lr': base_lr * 0.1,
'momentum': momentum,
'weight_decay': 0,
})
# group 4, bn is configured by 'norm_decay_mult'
groups.append(['bn.weight', 'bn.bias'])
group_settings.append({
'lr': base_lr,
'momentum': momentum,
'weight_decay': base_wd * 0.5,
})
# group 5, default group
groups.append(['conv1.weight', 'conv2.weight', 'conv2.bias'])
group_settings.append({
'lr': base_lr,
'momentum': momentum,
'weight_decay': base_wd
})
num_params = 14 if OPS_AVAILABLE else 11
assert len(param_groups) == num_params
for i, (name, param) in enumerate(model.named_parameters()):
assert torch.equal(param_groups[i]['params'][0], param)
for group, settings in zip(groups, group_settings):
if name in group:
for setting in settings:
assert param_groups[i][setting] == settings[
setting], f'{name} {setting}'
# test DefaultOptimizerConstructor with custom_keys and ExampleModel 2
model = ExampleModel()
optimizer_cfg = dict(type='SGD', lr=base_lr, momentum=momentum)
paramwise_cfg = dict(custom_keys={'param1': dict(lr_mult=10)})
optim_constructor = DefaultOptimizerConstructor(optimizer_cfg,
paramwise_cfg)
optimizer = optim_constructor(model)
# check optimizer type and default config
assert isinstance(optimizer, torch.optim.SGD)
assert optimizer.defaults['lr'] == base_lr
assert optimizer.defaults['momentum'] == momentum
assert optimizer.defaults['weight_decay'] == 0
# check params groups
param_groups = optimizer.param_groups
groups = []
group_settings = []
# group 1, matches of 'param1'
groups.append(['param1', 'sub.param1'])
group_settings.append({
'lr': base_lr * 10,
'momentum': momentum,
'weight_decay': 0,
})
# group 2, default group
groups.append([
'sub.conv1.weight', 'sub.conv1.bias', 'sub.gn.weight', 'sub.gn.bias',
'conv1.weight', 'conv2.weight', 'conv2.bias', 'bn.weight', 'bn.bias'
])
group_settings.append({
'lr': base_lr,
'momentum': momentum,
'weight_decay': 0
})
num_params = 14 if OPS_AVAILABLE else 11
assert len(param_groups) == num_params
for i, (name, param) in enumerate(model.named_parameters()):
assert torch.equal(param_groups[i]['params'][0], param)
for group, settings in zip(groups, group_settings):
if name in group:
for setting in settings:
assert param_groups[i][setting] == settings[
setting], f'{name} {setting}'
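# Every optimizer shipped with ``torch.optim`` is auto-registered, so at least
# the stock ones must appear in TORCH_OPTIMIZERS.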
def test_torch_optimizers():
torch_optimizers = [
'ASGD', 'Adadelta', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'LBFGS',
'Optimizer', 'RMSprop', 'Rprop', 'SGD', 'SparseAdam'
]
assert set(torch_optimizers).issubset(set(TORCH_OPTIMIZERS))
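# Building through a constructor config must match the behaviour of a directly
# instantiated DefaultOptimizerConstructor; a custom constructor registered in
# OPTIMIZER_BUILDERS is exercised afterwards.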
def test_build_optimizer_constructor():
model = ExampleModel()
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
paramwise_cfg = dict(
bias_lr_mult=2,
bias_decay_mult=0.5,
norm_decay_mult=0,
dwconv_decay_mult=0.1,
dcn_offset_lr_mult=0.1)
optim_constructor_cfg = dict(
type='DefaultOptimizerConstructor',
optimizer_cfg=optimizer_cfg,
paramwise_cfg=paramwise_cfg)
optim_constructor = build_optimizer_constructor(optim_constructor_cfg)
optimizer = optim_constructor(model)
check_sgd_optimizer(optimizer, model, **paramwise_cfg)
from mmcv.runner import OPTIMIZERS
from mmcv.utils import build_from_cfg
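    # Custom constructor that only scales the learning rate of parameters whose
    # names start with ``conv1``, controlled by ``conv1_lr_mult``.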
@OPTIMIZER_BUILDERS.register_module()
class MyOptimizerConstructor(DefaultOptimizerConstructor):
def __call__(self, model):
if hasattr(model, 'module'):
model = model.module
conv1_lr_mult = self.paramwise_cfg.get('conv1_lr_mult', 1.)
params = []
for name, param in model.named_parameters():
param_group = {'params': [param]}
if name.startswith('conv1') and param.requires_grad:
param_group['lr'] = self.base_lr * conv1_lr_mult
params.append(param_group)
optimizer_cfg['params'] = params
return build_from_cfg(optimizer_cfg, OPTIMIZERS)
paramwise_cfg = dict(conv1_lr_mult=5)
optim_constructor_cfg = dict(
type='MyOptimizerConstructor',
optimizer_cfg=optimizer_cfg,
paramwise_cfg=paramwise_cfg)
optim_constructor = build_optimizer_constructor(optim_constructor_cfg)
optimizer = optim_constructor(model)
param_groups = optimizer.param_groups
assert isinstance(optimizer, torch.optim.SGD)
assert optimizer.defaults['lr'] == base_lr
assert optimizer.defaults['momentum'] == momentum
assert optimizer.defaults['weight_decay'] == base_wd
for i, param in enumerate(model.parameters()):
param_group = param_groups[i]
assert torch.equal(param_group['params'][0], param)
assert param_group['momentum'] == momentum
# conv1.weight
assert param_groups[1]['lr'] == base_lr * paramwise_cfg['conv1_lr_mult']
assert param_groups[1]['weight_decay'] == base_wd
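# build_optimizer should also accept paramwise_cfg embedded directly in the
# optimizer config dict.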
def test_build_optimizer():
model = ExampleModel()
optimizer_cfg = dict(
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
optimizer = build_optimizer(model, optimizer_cfg)
check_default_optimizer(optimizer, model)
model = ExampleModel()
optimizer_cfg = dict(
type='SGD',
lr=base_lr,
weight_decay=base_wd,
momentum=momentum,
paramwise_cfg=dict(
bias_lr_mult=2,
bias_decay_mult=0.5,
norm_decay_mult=0,
dwconv_decay_mult=0.1,
dcn_offset_lr_mult=0.1))
optimizer = build_optimizer(model, optimizer_cfg)
check_sgd_optimizer(optimizer, model, **optimizer_cfg['paramwise_cfg'])