diff --git a/docs/zh_cn/tutorials/hook.md b/docs/zh_cn/tutorials/hook.md
index d74c426d6a048bd9ae78b2207657e59f1be46e59..14913abe2f815028fdf6bf09c696d509491e043f 100644
--- a/docs/zh_cn/tutorials/hook.md
+++ b/docs/zh_cn/tutorials/hook.md
@@ -114,7 +114,7 @@ default_hooks = dict(checkpoint=dict(type='CheckpointHook', save_best='auto'))
 
 You can also set `save_best` directly to an evaluation metric. For example, in a classification task you can set `save_best='top-1'`, and whether the current checkpoint is the best one will then be judged by the `'top-1'` value.
 
-Besides `save_best`, the parameters related to saving the best checkpoint also include `rule`, `greater_keys` and `less_keys`, which determine whether a larger or a smaller value of `save_bes` is better. For example, if `save_best='top-1'` is specified, you can set `rule='greater'`, which means that a larger value indicates a better checkpoint.
+Besides `save_best`, the parameters related to saving the best checkpoint also include `rule`, `greater_keys` and `less_keys`, which determine whether a larger or a smaller value of `save_best` is better. For example, if `save_best='top-1'` is specified, you can set `rule='greater'`, which means that a larger value indicates a better checkpoint.
 
 - Specify the path for saving checkpoints
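The hunk above describes how `save_best` and `rule` work together. Below is a minimal, hedged sketch of a `CheckpointHook` configuration using that combination; the metric name `'top-1'` is only a placeholder that must match a key reported by your evaluator, and `interval=1` is an assumed epoch-wise saving interval, not a recommendation from the patch.

```python
# Hedged sketch of the CheckpointHook options discussed above.
# 'top-1' is a placeholder metric key; replace it with whatever key
# your evaluator actually reports.
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        interval=1,          # save a checkpoint every epoch (assumed interval)
        save_best='top-1',   # track this metric to decide the "best" checkpoint
        rule='greater',      # a larger 'top-1' value means a better checkpoint
    ))
```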
diff --git a/mmengine/model/averaged_model.py b/mmengine/model/averaged_model.py
index 4326be1d23f9635ffdd22463435eabc44efe24ed..94da035b369ebf490ccd5c25c365e5d591cfee44 100644
--- a/mmengine/model/averaged_model.py
+++ b/mmengine/model/averaged_model.py
@@ -29,12 +29,13 @@ class BaseAveragedModel(nn.Module):
     In mmengine, we provide two ways to use the model averaging:
 
     1. Use the model averaging module in hook:
-       We provide an EMAHook to apply the model averaging during training.
-       Add ``custom_hooks=[dict(type='EMAHook')]`` to the config or the runner.
-       The hook is implemented in mmengine/hooks/ema_hook.py
+       We provide an :class:`mmengine.hooks.EMAHook` to apply the model
+       averaging during training. Add ``custom_hooks=[dict(type='EMAHook')]``
+       to the config or the runner.
 
     2. Use the model averaging module directly in the algorithm.
        Take the ema teacher in semi-supervised learning as an example:
 
+       >>> from mmengine.model import ExponentialMovingAverage
        >>> student = ResNet(depth=50)
        >>> # use ema model as teacher
diff --git a/mmengine/model/base_model/base_model.py b/mmengine/model/base_model/base_model.py
index 19fd78452d54d6058a4fa267e5e0f91dfbc5624b..25ce071966a08e5107e6fdd3658c13b3f54c3032 100644
--- a/mmengine/model/base_model/base_model.py
+++ b/mmengine/model/base_model/base_model.py
@@ -92,12 +92,12 @@ class BaseModel(BaseModule):
         :class:`IterBasedTrainLoop` will call this method to update model
         parameters. The default parameter update process is as follows:
 
-        1. Calls ``self.data_processor(data, training=False) to collect
-        batch_inputs and corresponding data_samples(labels).
+        1. Calls ``self.data_processor(data, training=False)`` to collect
+           batch_inputs and the corresponding data_samples (labels).
         2. Calls ``self(batch_inputs, data_samples, mode='loss')`` to get raw
-        loss
+           loss.
         3. Calls ``self.parse_losses`` to get ``parsed_losses`` tensor used to
-        backward and dict of loss tensor used to log messages.
+           backward and a dict of loss tensors used to log messages.
         4. Calls ``optim_wrapper.update_params(loss)`` to update model.
 
         Args:
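The four numbered steps in the ``train_step`` docstring above can be illustrated with a small, self-contained PyTorch sketch. This is an assumption-heavy illustration, not the actual ``BaseModel.train_step`` implementation: the ``parse_losses`` helper below is a simplified stand-in for the method of the same name, and the loss key ``'loss_cls'`` is arbitrary.

```python
import torch
import torch.nn as nn


def parse_losses(losses: dict):
    """Simplified stand-in: sum loss tensors and keep a float dict for logging."""
    total = sum(v for k, v in losses.items() if 'loss' in k)
    log_vars = {k: v.item() for k, v in losses.items()}
    return total, log_vars


def train_step(model: nn.Module, data: dict, optimizer: torch.optim.Optimizer):
    inputs, labels = data['inputs'], data['labels']   # 1. collect batch inputs and labels
    logits = model(inputs)                            # 2. forward pass producing raw losses
    losses = {'loss_cls': nn.functional.cross_entropy(logits, labels)}
    loss, log_vars = parse_losses(losses)             # 3. parse losses into one tensor + log dict
    optimizer.zero_grad()
    loss.backward()                                   # 4. backward and parameter update
    optimizer.step()
    return log_vars


model = nn.Linear(8, 3)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
batch = {'inputs': torch.randn(4, 8), 'labels': torch.randint(0, 3, (4,))}
print(train_step(model, batch, optimizer))
```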
diff --git a/mmengine/model/wrappers/distributed.py b/mmengine/model/wrappers/distributed.py
index 889dd4c455c7db4054ec52e7536b093442ff26b7..3631090055da0cef246af784214a9c2f16415fb2 100644
--- a/mmengine/model/wrappers/distributed.py
+++ b/mmengine/model/wrappers/distributed.py
@@ -26,9 +26,10 @@ class MMDistributedDataParallel(DistributedDataParallel):
       default model forward, gradient back propagation, parameter updating
      logic. To take advantage of DistributedDataParallel's automatic gradient
      synchronization, ``train_step`` calls ``DistributedDataParallel.forward``
-      to calculate the losses, and call other methods of :obj:`BaseModel` to
+      to calculate the losses, and calls other methods of :class:`BaseModel` to
      pre-process data and parse losses. Finally, update model parameters by
-      :obj:``OptimWrapper`` and return the loss dictionary used for logging.
+      :class:`OptimWrapper` and return the loss dictionary used
+      for logging.
 
     - ``val_step``: Called by ``runner.val_loop`` and get the inference
       results. Since there is no gradient synchronization requirement,
@@ -43,11 +44,10 @@ class MMDistributedDataParallel(DistributedDataParallel):
            the computational graph with `loss` as the root.
            There are two cases
 
-              - Parameters were not used during
-                forward pass.
-              - Parameters were not used to produce
-                loss.
-            Default: False.
+              - Parameters were not used during forward pass.
+              - Parameters were not used to produce loss.
+
+            Defaults to False.
 
         **kwargs: keyword arguments passed to ``DistributedDataParallel``.
 
@@ -57,8 +57,8 @@ class MMDistributedDataParallel(DistributedDataParallel):
              output for single-device CUDA modules.
            - dim (int): Defaults to 0.
            - broadcast_buffers (bool): Flag that enables syncing (
-            broadcasting) buffers of the module at beginning of the
-            ``forward`` function. Defaults to True
+              broadcasting) buffers of the module at beginning of the
+              ``forward`` function. Defaults to True
            - find_unused_parameters (bool): Whether to find parameters of
              module, which are not in the forward graph. Defaults to False.
            - process_group (ProcessGroup, optional): The process group to be
@@ -70,7 +70,8 @@ class MMDistributedDataParallel(DistributedDataParallel):
            - gradient_as_bucket_view (bool): Defaults to False.
            - static_graph (bool): Defaults to False.
 
-    See more information about arguments in `https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel`_ # noqa E501
+    See more information about arguments in
+    :class:`torch.nn.parallel.DistributedDataParallel`.
 
     Note:
         If model has multiple submodules and each module has
@@ -100,7 +101,7 @@ class MMDistributedDataParallel(DistributedDataParallel):
        :meth:`train_step` will perform the following steps in order:
 
        - If :attr:`module` defines the preprocess method,
-            call ``module.preprocess`` to pre-processing data.
+          call ``module.preprocess`` to pre-process data.
        - Call ``module.forward(**data)`` and get losses.
        - Parse losses.
        - Call ``optim_wrapper.optimizer_step`` to update parameters.
diff --git a/mmengine/model/wrappers/seperate_distributed.py b/mmengine/model/wrappers/seperate_distributed.py
index b730632606858ed9feff1c80c8eb81506b1d27c1..ac9c2383c325282a19d655b156820b711de04d53 100644
--- a/mmengine/model/wrappers/seperate_distributed.py
+++ b/mmengine/model/wrappers/seperate_distributed.py
@@ -41,7 +41,7 @@ class MMSeparateDistributedDataParallel(DistributedDataParallel):
        find_unused_parameters (bool): Same as that in
            ``torch.nn.parallel.distributed.DistributedDataParallel``.
            Traverse the autograd graph of all tensors contained in returned
-            value of the wrapped module’s forward function. Defaults to False.
+            value of the wrapped module's forward function. Defaults to False.
        **kwargs: Keyword arguments passed to ``MMDistributedDataParallel``.
 
            - device_ids (List[int] or torch.device, optional): CUDA devices
@@ -58,7 +58,8 @@ class MMSeparateDistributedDataParallel(DistributedDataParallel):
            - gradient_as_bucket_view (bool): Defaults to False.
            - static_graph (bool): Defaults to False.
 
-    See more information about arguments in `https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel`_ # noqa E501
+    See more information about arguments in
+    :class:`torch.nn.parallel.DistributedDataParallel`.
     """
 
     def __init__(self,
@@ -144,7 +145,7 @@ class MMSeparateDistributedDataParallel(DistributedDataParallel):
 
        Args:
            mode (bool): whether to set training mode (``True``) or evaluation
-                mode (``False``). Default: ``True``.
+                mode (``False``). Defaults to ``True``.
 
        Returns:
            Module: self.
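As a final illustration of the keyword arguments that both wrappers forward to ``torch.nn.parallel.DistributedDataParallel``, here is a hedged, plain-PyTorch sketch. It uses a single-process ``gloo`` group so it runs on CPU without a launcher; the mmengine wrappers themselves are not involved, and the flag values are examples rather than recommendations.

```python
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

# Single-process 'gloo' group so the sketch runs on CPU without torchrun.
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group('gloo', rank=0, world_size=1)

model = torch.nn.Linear(8, 3)
ddp = DistributedDataParallel(
    model,
    broadcast_buffers=True,        # sync (broadcast) buffers at the start of forward
    find_unused_parameters=False,  # set True only if parts of the graph may be skipped
)

out = ddp(torch.randn(4, 8))
out.sum().backward()               # gradients are all-reduced across ranks here
dist.destroy_process_group()
```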