From 5c6428a3cfbef3e9d48110265fc5c2b27e68db5a Mon Sep 17 00:00:00 2001
From: seuyou <3463423099@qq.com>
Date: Fri, 25 Feb 2022 21:57:30 +0800
Subject: [PATCH] [Feature]: Add new constructor for MAE finetune

---
 .../vit-b-p16_ft-8xb128-coslr-100e_in1k.py    |   1 +
 mmselfsup/core/optimizer/__init__.py          |   6 +-
 mmselfsup/core/optimizer/builder.py           |   5 +-
 mmselfsup/core/optimizer/constructor.py       |  40 +-----
 .../optimizer/mae_finetune_constructor.py     | 118 ++++++++++++++++++
 5 files changed, 127 insertions(+), 43 deletions(-)
 create mode 100644 mmselfsup/core/optimizer/mae_finetune_constructor.py

diff --git a/configs/benchmarks/classification/imagenet/vit-b-p16_ft-8xb128-coslr-100e_in1k.py b/configs/benchmarks/classification/imagenet/vit-b-p16_ft-8xb128-coslr-100e_in1k.py
index 462a50a9c..aead1b430 100644
--- a/configs/benchmarks/classification/imagenet/vit-b-p16_ft-8xb128-coslr-100e_in1k.py
+++ b/configs/benchmarks/classification/imagenet/vit-b-p16_ft-8xb128-coslr-100e_in1k.py
@@ -45,6 +45,7 @@
         'pos_embed': dict(weight_decay=0.),
         'cls_token': dict(weight_decay=0.)
     },
+    constructor='MAEFtOptimizerConstructor',
     layer_decay=0.65)
 
 # learning policy
diff --git a/mmselfsup/core/optimizer/__init__.py b/mmselfsup/core/optimizer/__init__.py
index 9f25370c2..3378fa0fa 100644
--- a/mmselfsup/core/optimizer/__init__.py
+++ b/mmselfsup/core/optimizer/__init__.py
@@ -1,6 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .builder import build_optimizer
 from .constructor import DefaultOptimizerConstructor
+from .mae_finetune_constructor import MAEFtOptimizerConstructor
 from .optimizers import LARS
 
-__all__ = ['LARS', 'build_optimizer', 'DefaultOptimizerConstructor']
+__all__ = [
+    'LARS', 'build_optimizer', 'DefaultOptimizerConstructor',
+    'MAEFtOptimizerConstructor'
+]
diff --git a/mmselfsup/core/optimizer/builder.py b/mmselfsup/core/optimizer/builder.py
index 9521fb8a4..a35fd6507 100644
--- a/mmselfsup/core/optimizer/builder.py
+++ b/mmselfsup/core/optimizer/builder.py
@@ -38,13 +38,10 @@ def build_optimizer(model, optimizer_cfg):
     constructor_type = optimizer_cfg.pop('constructor',
                                          'DefaultOptimizerConstructor')
     paramwise_cfg = optimizer_cfg.pop('paramwise_options', None)
-    layer_decay = optimizer_cfg.pop('layer_decay', 0.0)
-
     optim_constructor = build_optimizer_constructor(
         dict(
             type=constructor_type,
             optimizer_cfg=optimizer_cfg,
-            paramwise_cfg=paramwise_cfg,
-            layer_decay=layer_decay))
+            paramwise_cfg=paramwise_cfg))
     optimizer = optim_constructor(model)
     return optimizer
diff --git a/mmselfsup/core/optimizer/constructor.py b/mmselfsup/core/optimizer/constructor.py
index dc36297b4..2010f2300 100644
--- a/mmselfsup/core/optimizer/constructor.py
+++ b/mmselfsup/core/optimizer/constructor.py
@@ -22,9 +22,7 @@ class DefaultOptimizerConstructor:
                 - any arguments of the corresponding optimizer type, e.g.,
                   lr, weight_decay, momentum, etc.
         paramwise_cfg (dict, optional): Parameter-wise options.
-            Defaults to None
-        layer_decay (float): base value for layer wise learning rate decay.
-            Defaults to 0.0
+            Defaults to None.
 
     Example 1:
         >>> model = torch.nn.modules.Conv1d(1, 1, 1)
@@ -37,13 +35,12 @@ class DefaultOptimizerConstructor:
         >>> optimizer = optim_builder(model)
     """
 
-    def __init__(self, optimizer_cfg, paramwise_cfg=None, layer_decay=0.0):
+    def __init__(self, optimizer_cfg, paramwise_cfg=None):
         if not isinstance(optimizer_cfg, dict):
             raise TypeError('optimizer_cfg should be a dict',
                             f'but got {type(optimizer_cfg)}')
         self.optimizer_cfg = optimizer_cfg
         self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg
-        self.layer_decay = layer_decay
 
     def __call__(self, model):
         if hasattr(model, 'module'):
@@ -51,10 +48,6 @@ def __call__(self, model):
         optimizer_cfg = self.optimizer_cfg.copy()
         paramwise_options = self.paramwise_cfg
 
-        # generate layer-wise lr decay
-        if self.layer_decay > 0:
-            self._generate_layer_wise_lr_decay(model, paramwise_options)
-
         # if no paramwise option is specified, just use the global setting
         if paramwise_options is None:
             optimizer_cfg['params'] = model.parameters()
@@ -87,32 +80,3 @@ def __call__(self, model):
 
             optimizer_cfg['params'] = params
             return build_from_cfg(optimizer_cfg, OPTIMIZERS)
-
-    def _generate_layer_wise_lr_decay(self, model, paramwise_options):
-        """Currently, we follow the same layer-wise lr decay schedule as
-        MAE."""
-        num_layers = len(model.backbone.layers) + 1
-        layer_scales = list(self.layer_decay**(num_layers - i)
-                            for i in range(num_layers + 1))
-
-        if 'pos_embed' in paramwise_options:
-            paramwise_options['pos_embed'].update(
-                dict(lr_mult=layer_scales[0]))
-        else:
-            paramwise_options['pos_embed'] = dict(lr_mult=layer_scales[0])
-
-        if 'cls_token' in paramwise_options:
-            paramwise_options['cls_token'].update(
-                dict(lr_mult=layer_scales[0]))
-        else:
-            paramwise_options['cls_token'] = dict(lr_mult=layer_scales[0])
-
-        if 'patch_embed' in paramwise_options:
-            paramwise_options['patch_embed'].update(
-                dict(lr_mult=layer_scales[0]))
-        else:
-            paramwise_options['patch_embed'] = dict(lr_mult=layer_scales[0])
-
-        for i in range(num_layers - 1):
-            paramwise_options[f'backbone\\.layers\\.{i}\\.'] = dict(
-                lr_mult=layer_scales[i + 1])
diff --git a/mmselfsup/core/optimizer/mae_finetune_constructor.py b/mmselfsup/core/optimizer/mae_finetune_constructor.py
new file mode 100644
index 000000000..e0674fd59
--- /dev/null
+++ b/mmselfsup/core/optimizer/mae_finetune_constructor.py
@@ -0,0 +1,118 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import re
+
+import torch.distributed as dist
+from mmcv.runner.optimizer.builder import OPTIMIZER_BUILDERS, OPTIMIZERS
+from mmcv.utils import build_from_cfg, print_log
+
+
+@OPTIMIZER_BUILDERS.register_module()
+class MAEFtOptimizerConstructor:
+    """Rewrote default constructor for optimizers. By default each parameter
+    shares the same optimizer settings, and we provide an argument
+    ``paramwise_cfg`` to specify parameter-wise settings and set layer-wise
+    learning rate. It is a dict and may contain the following fields:
+
+    Args:
+        model (:obj:`nn.Module`): The model with parameters to be optimized.
+        optimizer_cfg (dict): The config dict of the optimizer.
+            Positional fields are
+                - `type`: class name of the optimizer.
+            Optional fields are
+                - any arguments of the corresponding optimizer type, e.g.,
+                  lr, weight_decay, momentum, etc.
+        paramwise_cfg (dict, optional): Parameter-wise options.
+            Defaults to None.
+        layer_decay (float): Base value for layer-wise learning rate decay,
+            read from ``optimizer_cfg``. Defaults to 0.0.
+
+    Example 1:
+        >>> model = torch.nn.modules.Conv1d(1, 1, 1)
+        >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9,
+        >>>                      weight_decay=0.0001)
+        >>> paramwise_cfg = {'bias': dict(weight_decay=0., \
+                             lars_exclude=True)}
+        >>> optim_builder = MAEFtOptimizerConstructor(
+        >>>     optimizer_cfg, paramwise_cfg)
+        >>> optimizer = optim_builder(model)
+    """
+
+    def __init__(self, optimizer_cfg, paramwise_cfg=None):
+        if not isinstance(optimizer_cfg, dict):
+            raise TypeError('optimizer_cfg should be a dict',
+                            f'but got {type(optimizer_cfg)}')
+        self.optimizer_cfg = optimizer_cfg
+        self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg
+        self.layer_decay = self.optimizer_cfg.pop('layer_decay', 0.0)
+
+    def __call__(self, model):
+        if hasattr(model, 'module'):
+            model = model.module
+        optimizer_cfg = self.optimizer_cfg.copy()
+        paramwise_options = self.paramwise_cfg
+
+        # generate layer-wise lr decay
+        if self.layer_decay > 0:
+            self._generate_layer_wise_lr_decay(model, paramwise_options)
+
+        # if no paramwise option is specified, just use the global setting
+        if paramwise_options is None:
+            optimizer_cfg['params'] = model.parameters()
+            return build_from_cfg(optimizer_cfg, OPTIMIZERS)
+        else:
+            assert isinstance(paramwise_options, dict)
+            params = []
+            for name, param in model.named_parameters():
+                param_group = {'params': [param]}
+                if not param.requires_grad:
+                    params.append(param_group)
+                    continue
+
+                for regexp, options in paramwise_options.items():
+                    if re.search(regexp, name):
+                        for key, value in options.items():
+                            if key.endswith('_mult'):  # is a multiplier
+                                key = key[:-5]
+                                assert key in optimizer_cfg, \
+                                    f'{key} not in optimizer_cfg'
+                                value = optimizer_cfg[key] * value
+                            param_group[key] = value
+                            if not dist.is_initialized() or \
+                                    dist.get_rank() == 0:
+                                print_log(f'paramwise_options -- \
+                                    {name}: {key}={value}')
+
+                # otherwise use the global settings
+                params.append(param_group)
+
+            optimizer_cfg['params'] = params
+            return build_from_cfg(optimizer_cfg, OPTIMIZERS)
+
+    def _generate_layer_wise_lr_decay(self, model, paramwise_options):
+        """Currently, we follow the same layer-wise lr decay schedule as
+        MAE."""
+        num_layers = len(model.backbone.layers) + 1
+        layer_scales = list(self.layer_decay**(num_layers - i)
+                            for i in range(num_layers + 1))
+
+        if 'pos_embed' in paramwise_options:
+            paramwise_options['pos_embed'].update(
+                dict(lr_mult=layer_scales[0]))
+        else:
+            paramwise_options['pos_embed'] = dict(lr_mult=layer_scales[0])
+
+        if 'cls_token' in paramwise_options:
+            paramwise_options['cls_token'].update(
+                dict(lr_mult=layer_scales[0]))
+        else:
+            paramwise_options['cls_token'] = dict(lr_mult=layer_scales[0])
+
+        if 'patch_embed' in paramwise_options:
+            paramwise_options['patch_embed'].update(
+                dict(lr_mult=layer_scales[0]))
+        else:
+            paramwise_options['patch_embed'] = dict(lr_mult=layer_scales[0])
+
+        for i in range(num_layers - 1):
+            paramwise_options[f'backbone\\.layers\\.{i}\\.'] = dict(
+                lr_mult=layer_scales[i + 1])
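
Usage sketch (not part of the patch): the snippet below shows how the new
constructor is exercised end to end through ``build_optimizer``.
``DummyBackbone``/``DummyModel`` and the hyper-parameter values are made-up
stand-ins; the only structural requirement the constructor places on the model
is a ``backbone.layers`` sequence whose length gives the number of transformer
blocks used for the layer-wise decay.

    # Hypothetical stand-ins for a ViT-style classifier; only
    # `backbone.layers` is needed by MAEFtOptimizerConstructor to count blocks.
    import torch.nn as nn

    from mmselfsup.core.optimizer import build_optimizer


    class DummyBackbone(nn.Module):

        def __init__(self):
            super().__init__()
            self.patch_embed = nn.Linear(16, 16)
            self.layers = nn.ModuleList(
                [nn.Linear(16, 16) for _ in range(12)])


    class DummyModel(nn.Module):

        def __init__(self):
            super().__init__()
            self.backbone = DummyBackbone()
            self.head = nn.Linear(16, 1000)


    optimizer_cfg = dict(
        type='AdamW',
        lr=1e-3,
        weight_decay=0.05,
        constructor='MAEFtOptimizerConstructor',
        paramwise_options={'bias': dict(weight_decay=0.)},
        layer_decay=0.65)

    # build_optimizer pops `constructor` and `paramwise_options`; the new
    # constructor then pops `layer_decay` from the remaining optimizer_cfg and
    # scales the lr by 0.65**13 for patch_embed/pos_embed/cls_token and by
    # 0.65**(12 - i) for backbone.layers.i, leaving the head at the base lr.
    optimizer = build_optimizer(DummyModel(), optimizer_cfg)

This mirrors what the updated config above expresses via
``constructor='MAEFtOptimizerConstructor'`` and ``layer_decay=0.65``.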