diff --git a/docs/zh_cn/tutorials/hook.md b/docs/zh_cn/tutorials/hook.md
index d74c426d6a048bd9ae78b2207657e59f1be46e59..14913abe2f815028fdf6bf09c696d509491e043f 100644
--- a/docs/zh_cn/tutorials/hook.md
+++ b/docs/zh_cn/tutorials/hook.md
@@ -114,7 +114,7 @@ default_hooks = dict(checkpoint=dict(type='CheckpointHook', save_best='auto'))
 
 You can also set `save_best` directly to an evaluation metric. For example, in a classification task you can set `save_best='top-1'`, and whether the current checkpoint is the best one will then be judged by the `'top-1'` value.
 
-Besides `save_best`, the parameters related to saving the best checkpoint also include `rule`, `greater_keys` and `less_keys`, which determine whether a larger or a smaller value of `save_bes` is better. For example, if `save_best='top-1'` is specified, you can set `rule='greater'`, which means that a larger value indicates a better checkpoint.
+Besides `save_best`, the parameters related to saving the best checkpoint also include `rule`, `greater_keys` and `less_keys`, which determine whether a larger or a smaller value of `save_best` is better. For example, if `save_best='top-1'` is specified, you can set `rule='greater'`, which means that a larger value indicates a better checkpoint.
 
 - Specify the path for saving checkpoints
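The hunk above describes how `save_best` and `rule` work together. Below is a minimal, hedged sketch of a `CheckpointHook` configuration using that combination; the metric name `'top-1'` is only a placeholder that must match a key reported by your evaluator, and `interval=1` is an assumed epoch-wise saving interval, not a recommendation from the patch.

```python
# Hedged sketch of the CheckpointHook options discussed above.
# 'top-1' is a placeholder metric key; replace it with whatever key
# your evaluator actually reports.
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        interval=1,          # save a checkpoint every epoch (assumed interval)
        save_best='top-1',   # track this metric to decide the "best" checkpoint
        rule='greater',      # a larger 'top-1' value means a better checkpoint
    ))
```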
diff --git a/mmengine/model/averaged_model.py b/mmengine/model/averaged_model.py
index 4326be1d23f9635ffdd22463435eabc44efe24ed..94da035b369ebf490ccd5c25c365e5d591cfee44 100644
--- a/mmengine/model/averaged_model.py
+++ b/mmengine/model/averaged_model.py
@@ -29,12 +29,13 @@ class BaseAveragedModel(nn.Module):
     In mmengine, we provide two ways to use the model averaging:
 
     1. Use the model averaging module in hook:
-       We provide an EMAHook to apply the model averaging during training.
-       Add ``custom_hooks=[dict(type='EMAHook')]`` to the config or the runner.
-       The hook is implemented in mmengine/hooks/ema_hook.py
+       We provide an :class:`mmengine.hooks.EMAHook` to apply the model
+       averaging during training. Add ``custom_hooks=[dict(type='EMAHook')]``
+       to the config or the runner.
 
     2. Use the model averaging module directly in the algorithm.
        Take the ema teacher in semi-supervised learning as an example:
 
+       >>> from mmengine.model import ExponentialMovingAverage
        >>> student = ResNet(depth=50)
        >>> # use ema model as teacher
diff --git a/mmengine/model/base_model/base_model.py b/mmengine/model/base_model/base_model.py
index 19fd78452d54d6058a4fa267e5e0f91dfbc5624b..25ce071966a08e5107e6fdd3658c13b3f54c3032 100644
--- a/mmengine/model/base_model/base_model.py
+++ b/mmengine/model/base_model/base_model.py
@@ -92,12 +92,12 @@ class BaseModel(BaseModule):
         :class:`IterBasedTrainLoop` will call this method to update model
         parameters. The default parameter update process is as follows:
 
-        1. Calls ``self.data_processor(data, training=False) to collect
-        batch_inputs and corresponding data_samples(labels).
+        1. Calls ``self.data_processor(data, training=False)`` to collect
+           batch_inputs and the corresponding data_samples (labels).
         2. Calls ``self(batch_inputs, data_samples, mode='loss')`` to get raw
-        loss
+           loss.
         3. Calls ``self.parse_losses`` to get ``parsed_losses`` tensor used to
-        backward and dict of loss tensor used to log messages.
+           backward and a dict of loss tensors used to log messages.
         4. Calls ``optim_wrapper.update_params(loss)`` to update model.
 
         Args:
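The four numbered steps in the ``train_step`` docstring above can be illustrated with a small, self-contained PyTorch sketch. This is an assumption-heavy illustration, not the actual ``BaseModel.train_step`` implementation: the ``parse_losses`` helper below is a simplified stand-in for the method of the same name, and the loss key ``'loss_cls'`` is arbitrary.

```python
import torch
import torch.nn as nn


def parse_losses(losses: dict):
    """Simplified stand-in: sum loss tensors and keep a float dict for logging."""
    total = sum(v for k, v in losses.items() if 'loss' in k)
    log_vars = {k: v.item() for k, v in losses.items()}
    return total, log_vars


def train_step(model: nn.Module, data: dict, optimizer: torch.optim.Optimizer):
    inputs, labels = data['inputs'], data['labels']   # 1. collect batch inputs and labels
    logits = model(inputs)                            # 2. forward pass producing raw losses
    losses = {'loss_cls': nn.functional.cross_entropy(logits, labels)}
    loss, log_vars = parse_losses(losses)             # 3. parse losses into one tensor + log dict
    optimizer.zero_grad()
    loss.backward()                                   # 4. backward and parameter update
    optimizer.step()
    return log_vars


model = nn.Linear(8, 3)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
batch = {'inputs': torch.randn(4, 8), 'labels': torch.randint(0, 3, (4,))}
print(train_step(model, batch, optimizer))
```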
diff --git a/mmengine/model/wrappers/distributed.py b/mmengine/model/wrappers/distributed.py
index 889dd4c455c7db4054ec52e7536b093442ff26b7..3631090055da0cef246af784214a9c2f16415fb2 100644
--- a/mmengine/model/wrappers/distributed.py
+++ b/mmengine/model/wrappers/distributed.py
@@ -26,9 +26,10 @@ class MMDistributedDataParallel(DistributedDataParallel):
       default model forward, gradient back propagation, parameter updating
      logic. To take advantage of DistributedDataParallel's automatic gradient
      synchronization, ``train_step`` calls ``DistributedDataParallel.forward``
-      to calculate the losses, and call other methods of :obj:`BaseModel` to
+      to calculate the losses, and calls other methods of :class:`BaseModel` to
      pre-process data and parse losses. Finally, update model parameters by
-      :obj:``OptimWrapper`` and return the loss dictionary used for logging.
+      :class:`OptimWrapper` and return the loss dictionary used
+      for logging.
 
     - ``val_step``: Called by ``runner.val_loop`` and get the inference
       results. Since there is no gradient synchronization requirement,
@@ -43,11 +44,10 @@ class MMDistributedDataParallel(DistributedDataParallel):
            the computational graph with `loss` as the root.
            There are two cases
 
-              - Parameters were not used during
-                forward pass.
-              - Parameters were not used to produce
-                loss.
-            Default: False.
+              - Parameters were not used during forward pass.
+              - Parameters were not used to produce loss.
+
+            Defaults to False.
 
         **kwargs: keyword arguments passed to ``DistributedDataParallel``.
 
@@ -57,8 +57,8 @@ class MMDistributedDataParallel(DistributedDataParallel):
              output for single-device CUDA modules.
            - dim (int): Defaults to 0.
            - broadcast_buffers (bool): Flag that enables syncing (
-            broadcasting) buffers of the module at beginning of the
-            ``forward`` function. Defaults to True
+              broadcasting) buffers of the module at beginning of the
+              ``forward`` function. Defaults to True
            - find_unused_parameters (bool): Whether to find parameters of
              module, which are not in the forward graph. Defaults to False.
            - process_group (ProcessGroup, optional): The process group to be
@@ -70,7 +70,8 @@ class MMDistributedDataParallel(DistributedDataParallel):
            - gradient_as_bucket_view (bool): Defaults to False.
            - static_graph (bool): Defaults to False.
 
-    See more information about arguments in `https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel`_ # noqa E501
+    See more information about arguments in
+    :class:`torch.nn.parallel.DistributedDataParallel`.
 
     Note:
         If model has multiple submodules and each module has
@@ -100,7 +101,7 @@ class MMDistributedDataParallel(DistributedDataParallel):
        :meth:`train_step` will perform the following steps in order:
 
        - If :attr:`module` defines the preprocess method,
-            call ``module.preprocess`` to pre-processing data.
+          call ``module.preprocess`` to pre-process data.
        - Call ``module.forward(**data)`` and get losses.
        - Parse losses.
        - Call ``optim_wrapper.optimizer_step`` to update parameters.
diff --git a/mmengine/model/wrappers/seperate_distributed.py b/mmengine/model/wrappers/seperate_distributed.py
index b730632606858ed9feff1c80c8eb81506b1d27c1..ac9c2383c325282a19d655b156820b711de04d53 100644
--- a/mmengine/model/wrappers/seperate_distributed.py
+++ b/mmengine/model/wrappers/seperate_distributed.py
@@ -41,7 +41,7 @@ class MMSeparateDistributedDataParallel(DistributedDataParallel):
        find_unused_parameters (bool): Same as that in
            ``torch.nn.parallel.distributed.DistributedDataParallel``.
            Traverse the autograd graph of all tensors contained in returned
-            value of the wrapped module’s forward function. Defaults to False.
+            value of the wrapped module's forward function. Defaults to False.
        **kwargs: Keyword arguments passed to ``MMDistributedDataParallel``.
 
            - device_ids (List[int] or torch.device, optional): CUDA devices
@@ -58,7 +58,8 @@ class MMSeparateDistributedDataParallel(DistributedDataParallel):
            - gradient_as_bucket_view (bool): Defaults to False.
            - static_graph (bool): Defaults to False.
 
-    See more information about arguments in `https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel`_ # noqa E501
+    See more information about arguments in
+    :class:`torch.nn.parallel.DistributedDataParallel`.
     """
 
     def __init__(self,
@@ -144,7 +145,7 @@ class MMSeparateDistributedDataParallel(DistributedDataParallel):
 
        Args:
            mode (bool): whether to set training mode (``True``) or evaluation
-                mode (``False``). Default: ``True``.
+                mode (``False``). Defaults to ``True``.
 
        Returns:
            Module: self.
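As a final illustration of the keyword arguments that both wrappers forward to ``torch.nn.parallel.DistributedDataParallel``, here is a hedged, plain-PyTorch sketch. It uses a single-process ``gloo`` group so it runs on CPU without a launcher; the mmengine wrappers themselves are not involved, and the flag values are examples rather than recommendations.

```python
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

# Single-process 'gloo' group so the sketch runs on CPU without torchrun.
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group('gloo', rank=0, world_size=1)

model = torch.nn.Linear(8, 3)
ddp = DistributedDataParallel(
    model,
    broadcast_buffers=True,        # sync (broadcast) buffers at the start of forward
    find_unused_parameters=False,  # set True only if parts of the graph may be skipped
)

out = ddp(torch.randn(4, 8))
out.sum().backward()               # gradients are all-reduced across ranks here
dist.destroy_process_group()
```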