diff --git a/mmengine/hooks/logger_hook.py b/mmengine/hooks/logger_hook.py index 65d5d8a4d523f206020733bb25bfad7c1e68e26a..5b5296489ae9da4e76725849a6123de7ceef8ba3 100644 --- a/mmengine/hooks/logger_hook.py +++ b/mmengine/hooks/logger_hook.py @@ -20,7 +20,7 @@ class LoggerHook(Hook): ``LoggerHook`` is used to record logs formatted by ``LogProcessor`` during training/validation/testing phase. It is used to control following - behaviers: + behaviors: - The frequency of logs update in terminal, local, tensorboad wandb.etc. - The frequency of show experiment information in terminal. @@ -41,10 +41,10 @@ class LoggerHook(Hook): of ``out_dir`` and the last level directory of ``runner.work_dir``. For example, if the input ``our_dir`` is ``./tmp`` and ``runner.work_dir`` is ``./work_dir/cur_exp``, - then the log will be saved in ``./tmp/cur_exp``. Deafule to None. - out_suffix (Tuple[str] or str): Those filenames ending with - ``out_suffix`` will be copied to ``out_dir``. Defaults to - ('.log.json', '.log', '.py'). + then the log will be saved in ``./tmp/cur_exp``. Defaults to None. + out_suffix (Tuple[str] or str): Those files in ``runner._log_dir`` + ending with ``out_suffix`` will be copied to ``out_dir``. Defaults + to ('.json', '.log', '.py', 'yaml'). keep_local (bool): Whether to keep local logs in the local machine when :attr:`out_dir` is specified. If False, the local log will be removed. Defaults to True. @@ -53,7 +53,7 @@ class LoggerHook(Hook): Defaults to None. Examples: - >>> # A simplest LoggerHook config. 
>>> logger_hook_cfg = dict(interval=20) """ priority = 'BELOW_NORMAL' @@ -64,7 +64,8 @@ class LoggerHook(Hook): ignore_last: bool = True, interval_exp_name: int = 1000, out_dir: Optional[Union[str, Path]] = None, - out_suffix: Union[Sequence[str], str] = ('.log.json', '.log', '.py'), + out_suffix: Union[Sequence[str], + str] = ('.json', '.log', '.py', 'yaml'), keep_local: bool = True, file_client_args: Optional[dict] = None, ): @@ -86,6 +87,7 @@ class LoggerHook(Hook): self.keep_local = keep_local self.file_client_args = file_client_args + self.json_log_path: Optional[str] = None if self.out_dir is not None: self.file_client = FileClient.infer_client(file_client_args, self.out_dir) @@ -106,36 +108,32 @@ class LoggerHook(Hook): (f'Text logs will be saved to {self.out_dir} by ' f'{self.file_client.name} after the training process.')) - self.json_log_path = osp.join(runner.work_dir, - f'{runner.timestamp}.log.json') - self.yaml_log_path = osp.join(runner.work_dir, - f'{runner.timestamp}.log.json') + self.json_log_path = f'{runner.timestamp}.json' def after_train_iter(self, runner, batch_idx: int, data_batch: DATA_BATCH = None, outputs: Optional[dict] = None) -> None: - """Record training logs after training iteration. + """Record logs after training iteration. Args: runner (Runner): The runner of the training process. batch_idx (int): The index of the current batch in the train loop. data_batch (Sequence[dict], optional): Data from dataloader. Defaults to None. - outputs (dict, optional): Outputs from model. - Defaults to None. + outputs (dict, optional): Outputs from model. Defaults to None. """ # Print experiment name every n iterations. 
if self.every_n_iters(runner, self.interval_exp_name) or (self.end_of_epoch( - runner.train_dataloader, batch_idx)): + runner.train_loop.dataloader, batch_idx)): exp_info = f'Exp name: {runner.experiment_name}' runner.logger.info(exp_info) if self.every_n_inner_iters(batch_idx, self.interval): tag, log_str = runner.log_processor.get_log_after_iter( runner, batch_idx, 'train') - elif (self.end_of_epoch(runner.train_dataloader, batch_idx) + elif (self.end_of_epoch(runner.train_loop.dataloader, batch_idx) and not self.ignore_last): # `runner.max_iters` may not be divisible by `self.interval`. if # `self.ignore_last==True`, the log of remaining iterations will @@ -146,8 +144,8 @@ class LoggerHook(Hook): else: return runner.logger.info(log_str) - # TODO compatible with visualizer. - runner.visualizer.add_scalars(tag, step=runner.iter + 1) + runner.visualizer.add_scalars( + tag, step=runner.iter + 1, file_path=self.json_log_path) def after_val_iter( self, @@ -155,17 +153,18 @@ class LoggerHook(Hook): batch_idx: int, data_batch: DATA_BATCH = None, outputs: Optional[Sequence[BaseDataElement]] = None) -> None: - """Record validation logs after validation iteration. + """Record logs after validation iteration. Args: - runner (Runner): The runner of the training process. - batch_idx (int): The index of the current batch in the train loop. - data_batch (Sequence[Tuple[Any, BaseDataElement]], optional): - Data from dataloader. Defaults to None. + runner (Runner): The runner of the validation process. + batch_idx (int): The index of the current batch in the validation + loop. + data_batch (Sequence[dict], optional): Data from dataloader. + Defaults to None. outputs (sequence, optional): Outputs from model. Defaults to None. 
""" if self.every_n_inner_iters(batch_idx, self.interval): - tag, log_str = runner.log_processor.get_log_after_iter( + _, log_str = runner.log_processor.get_log_after_iter( runner, batch_idx, 'val') runner.logger.info(log_str) @@ -175,39 +174,39 @@ class LoggerHook(Hook): batch_idx: int, data_batch: DATA_BATCH = None, outputs: Optional[Sequence[BaseDataElement]] = None) -> None: - """Record testing logs after iteration. + """Record logs after testing iteration. Args: - runner (Runner): The runner of the training process. - batch_idx (int): The index of the current batch in the train loop. - data_batch (Sequence[Tuple[Any, BaseDataElement]], optional): - Data from dataloader. Defaults to None. + runner (Runner): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (Sequence[dict], optional): Data from dataloader. + Defaults to None. outputs (sequence, optional): Outputs from model. Defaults to None. """ if self.every_n_inner_iters(batch_idx, self.interval): - tag, log_str = runner.log_processor.get_log_after_iter( + _, log_str = runner.log_processor.get_log_after_iter( runner, batch_idx, 'test') runner.logger.info(log_str) def after_val_epoch(self, runner) -> None: - """Record validation logs after validation epoch. + """Record logs after validation epoch. Args: - runner (Runner): The runner of the training process. + runner (Runner): The runner of the validation process. """ tag, log_str = runner.log_processor.get_log_after_epoch( runner, len(runner.val_dataloader), 'val') runner.logger.info(log_str) - # TODO compatible with visualizer. - runner.visualizer.add_scalars(tag, step=runner.iter + 1) + runner.visualizer.add_scalars( + tag, step=runner.iter, file_path=self.json_log_path) def after_test_epoch(self, runner) -> None: - """Record testing logs after test epoch. + """Record logs after testing epoch. Args: - runner (Runner): The runner of the training process. 
+ runner (Runner): The runner of the testing process. """ - tag, log_str = runner.log_processor.get_log_after_epoch( + _, log_str = runner.log_processor.get_log_after_epoch( runner, len(runner.val_dataloader), 'test') runner.logger.info(log_str) @@ -215,13 +214,14 @@ class LoggerHook(Hook): """Copy logs to ``self.out_dir`` if ``self.out_dir is not None`` Args: - runner (Runner): The runner of the training process. + runner (Runner): The runner of the training/testing/validation + process. """ # copy or upload logs to self.out_dir if self.out_dir is None: return - for filename in scandir(runner.work_dir, self.out_suffix, True): - local_filepath = osp.join(runner.work_dir, filename) + for filename in scandir(runner._log_dir, self.out_suffix, True): + local_filepath = osp.join(runner._log_dir, filename) out_filepath = self.file_client.join_path(self.out_dir, filename) with open(local_filepath, 'r') as f: self.file_client.put_text(f.read(), out_filepath) diff --git a/mmengine/logging/log_processor.py b/mmengine/logging/log_processor.py index 3619bf4b7983d0cc1d15dec25f30362c1a838a30..8454112345a78e808dcc76881bd15f4539cc19c7 100644 --- a/mmengine/logging/log_processor.py +++ b/mmengine/logging/log_processor.py @@ -181,7 +181,7 @@ class LogProcessor: """Format log string after validation or testing epoch. Args: - runner (Runner): The runner of training phase. + runner (Runner): The runner of validation/testing phase. batch_idx (int): The index of the current batch in the current loop. mode (str): Current mode of runner. @@ -200,10 +200,12 @@ class LogProcessor: custom_cfg_copy = self._parse_windows_size(runner, batch_idx) # tag is used to write log information to different backends. tag = self._collect_scalars(custom_cfg_copy, runner, mode) - # validation log string needs cur epoch/iteration and max - # epochs/iterations. test log string only needs length of test - # dataloader. 
- cur_iter = self._get_iter(runner, batch_idx) + # By epoch: + # Epoch(val) [10][1000/1000] ... + # Epoch(test) [1000/1000] ... + # By iteration: + # Iteration(val) [1000/1000] ... + # Iteration(test) [1000/1000] ... if self.by_epoch: if mode == 'val': cur_epoch = self._get_epoch(runner, mode) @@ -214,12 +216,9 @@ class LogProcessor: f'Epoch({mode}) [{dataloader_len}/{dataloader_len}] ') else: - if mode == 'train': - log_str = (f'Iter({mode}) [{cur_iter}/' - f'{runner.train_loop.max_iters}] ') - else: - log_str = ( - f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ') + log_str = (f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ') + # `time` and `data_time` will not be recorded in after epoch log + # message. log_items = [] for name, val in tag.items(): if name in ('time', 'data_time'): @@ -237,9 +236,9 @@ class LogProcessor: Args: custom_cfg (List[dict]): A copy of ``self.custom_cfg`` with int ``window_size``. - runner (Runner): The runner of the training process. - mode (str): 'train' or 'val', which means the prefix attached by - runner. + runner (Runner): The runner of the training/testing/validation + process. + mode (str): Current mode of runner. Returns: dict: Statistical values of logs. @@ -253,7 +252,7 @@ class LogProcessor: # according to mode. for prefix_key, log_buffer in history_scalars.items(): if prefix_key.startswith(mode): - key = prefix_key.split('/')[-1] + key = prefix_key.partition('/')[-1] mode_history_scalars[key] = log_buffer for key in mode_history_scalars: # Update the latest learning rate and smoothed time logs. @@ -287,25 +286,23 @@ class LogProcessor: ' is False.' def _check_repeated_log_name(): - check_dict = dict() # The `log_name` of the same data_src should not be repeated. # If `log_name` is not specified, `data_src` will be overwritten. # But only allowed to be overwritten once. 
+ check_set = set() for log_cfg in self.custom_cfg: assert 'data_src' in log_cfg data_src = log_cfg['data_src'] log_name = log_cfg.get('log_name', data_src) - check_dict.setdefault(data_src, - dict(log_names=set(), log_counts=0)) - check_dict[data_src]['log_names'].add(log_name) - check_dict[data_src]['log_counts'] += 1 - assert (len( - check_dict[data_src] - ['log_names']) == check_dict[data_src]['log_counts']), ( - f'If you want to statistic {data_src} with multiple ' - 'statistics method, please check `log_name` is unique' - f'and {data_src} will not be overwritten twice. See ' - f'more information in the docstring of `LogProcessor`') + assert log_name not in check_set, ( + f'Found duplicate {log_name} for {data_src}. Please check ' + 'your `custom_cfg` for `log_processor`. You should ' + f'neither define duplicate `{log_name}` for {data_src} ' + f'nor define the same {log_name} for multiple ' + f'{data_src}. See more information in the docstring of ' + 'LogProcessor') + + check_set.add(log_name) _check_repeated_log_name() _check_window_size() @@ -314,7 +311,8 @@ class LogProcessor: """Parse window_size defined in custom_cfg to int value. Args: - runner (Runner): The runner of the training process. + runner (Runner): The runner of the training/testing/validation + process. batch_idx (int): The iteration index of current dataloader. """ custom_cfg_copy = copy.deepcopy(self.custom_cfg) @@ -337,7 +335,8 @@ class LogProcessor: for a given device. Args: - runner (Runner): The runner of the training process. + runner (Runner): The runner of the training/testing/validation + process. Returns: The maximum GPU memory occupied by tensors in megabytes for a given @@ -352,11 +351,12 @@ class LogProcessor: return int(mem_mb.item()) def _get_iter(self, runner, batch_idx: int = None) -> int: - """Get current training iteration step. + """Get current iteration index. Args: - runner (Runner): The runner of the training process. 
- batch_idx (int, optional): The interaction index of current + runner (Runner): The runner of the training/testing/validation + process. + batch_idx (int, optional): The iteration index of current dataloader. Defaults to None. Returns: @@ -372,8 +372,9 @@ class LogProcessor: """Get current epoch according to mode. Args: - runner (Runner): The runner of the training/validation process. - mode (str): Current mode of runner, "train" or "val". + runner (Runner): The runner of the training/testing/validation + process. + mode (str): Current mode of runner. Returns: int: The current epoch. @@ -395,7 +396,7 @@ class LogProcessor: Args: runner (Runner): The runner of the training/validation/testing process. - mode (str): Current mode of runner, "train", "val" or test. + mode (str): Current mode of runner. Returns: BaseLoop: Current loop of runner. diff --git a/mmengine/runner/runner.py b/mmengine/runner/runner.py index 98cddde636272c2e842a2d6a58380764e3479af0..ec4fccbba67eb959ab670251e54305c5251e5f38 100644 --- a/mmengine/runner/runner.py +++ b/mmengine/runner/runner.py @@ -64,7 +64,9 @@ class Runner: Args: model (:obj:`torch.nn.Module` or dict): The model to be run. It can be a dict used for build a model. - work_dir (str): The working directory to save checkpoints and logs. + work_dir (str): The working directory to save checkpoints. The logs + will be saved in the subdirectory of `work_dir` named + :attr:`timestamp`. train_dataloader (Dataloader or dict, optional): A dataloader object or a dict to build a dataloader. If ``None`` is given, it means skipping training steps. Defaults to None. @@ -637,7 +639,7 @@ class Runner: MMLogger: A MMLogger object build from ``logger``. 
""" if log_file is None: - log_file = osp.join(self._log_dir, f'{self._experiment_name}.log') + log_file = osp.join(self._log_dir, f'{self.timestamp}.log') log_cfg = dict(log_level=log_level, log_file=log_file, **kwargs) log_cfg.setdefault('name', self._experiment_name) diff --git a/tests/test_hook/test_logger_hook.py b/tests/test_hook/test_logger_hook.py index 138ce2a1f031c549483bade8e4d2e4114ae469fa..2e05b84fb685cc7932a085ea6891e7ef8c6374e8 100644 --- a/tests/test_hook/test_logger_hook.py +++ b/tests/test_hook/test_logger_hook.py @@ -15,7 +15,7 @@ class TestLoggerHook: assert logger_hook.interval == 10 assert logger_hook.ignore_last assert logger_hook.interval_exp_name == 1000 - assert logger_hook.out_suffix == ('.log.json', '.log', '.py') + assert logger_hook.out_suffix == ('.json', '.log', '.py', 'yaml') assert logger_hook.keep_local assert logger_hook.file_client_args is None assert isinstance(logger_hook.file_client.client, HardDiskBackend) @@ -29,36 +29,42 @@ class TestLoggerHook: def test_before_run(self): runner = MagicMock() runner.iter = 10 - runner.timestamp = 'timestamp' + runner.timestamp = '20220429' + runner._log_dir = f'work_dir/{runner.timestamp}' runner.work_dir = 'work_dir' runner.logger = MagicMock() logger_hook = LoggerHook(out_dir='out_dir') logger_hook.before_run(runner) assert logger_hook.out_dir == osp.join('out_dir', 'work_dir') - assert logger_hook.json_log_path == osp.join('work_dir', - 'timestamp.log.json') + assert logger_hook.json_log_path == f'{runner.timestamp}.json' def test_after_run(self, tmp_path): # Test + timestamp = '20220429' out_dir = tmp_path / 'out_dir' out_dir.mkdir() work_dir = tmp_path / 'work_dir' work_dir.mkdir() - work_dir_json = work_dir / 'tmp.log.json' + log_dir = work_dir / timestamp + log_dir.mkdir() + log_dir_json = log_dir / 'tmp.log.json' runner = MagicMock() - runner.work_dir = work_dir + runner._log_dir = str(log_dir) + runner.timestamp = timestamp + runner.work_dir = str(work_dir) # Test without out_dir. 
logger_hook = LoggerHook() logger_hook.after_run(runner) # Test with out_dir and make sure json file has been moved to out_dir. - json_f = open(work_dir_json, 'w') + json_f = open(log_dir_json, 'w') json_f.close() - logger_hook = LoggerHook(out_dir=str(tmp_path), keep_local=False) + logger_hook = LoggerHook(out_dir=str(out_dir), keep_local=False) logger_hook.out_dir = str(out_dir) + logger_hook.before_run(runner) logger_hook.after_run(runner) # Verify that the file has been moved to `out_dir`. - assert not osp.exists(str(work_dir_json)) - assert osp.exists(str(out_dir / 'tmp.log.json')) + assert not osp.exists(str(log_dir_json)) + assert osp.exists(str(out_dir / 'work_dir' / 'tmp.log.json')) def test_after_train_iter(self): # Test LoggerHook by iter. @@ -89,7 +95,7 @@ class TestLoggerHook: runner.log_processor.get_log_after_iter = MagicMock( return_value=(dict(), 'log_str')) logger_hook = LoggerHook(ignore_last=False) - runner.train_dataloader = [0] * 5 + runner.train_loop.dataloader = [0] * 5 logger_hook.after_train_iter(runner, batch_idx=4) runner.log_processor.get_log_after_iter.assert_called()