diff --git a/mmengine/dist/dist.py b/mmengine/dist/dist.py index b989c45a7ca7aa5c07c92d54a3e1be71d6e10efc..f23f1d067df591adaaf19bb73cc827e0dab609c8 100644 --- a/mmengine/dist/dist.py +++ b/mmengine/dist/dist.py @@ -1000,6 +1000,11 @@ def collect_results_cpu(result_part: list, part_list = [] for i in range(world_size): path = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore + if not osp.exists(path): + raise FileNotFoundError( + f'{tmpdir} is not an shared directory for ' + f'rank {i}, please make sure {tmpdir} is a shared ' + 'directory for all ranks!') with open(path, 'rb') as f: part_list.append(pickle.load(f)) # sort the results diff --git a/mmengine/evaluator/metric.py b/mmengine/evaluator/metric.py index a32e2848c6a5646e5786c84b1ed6499b935f69cc..3b1fc62b04aa98409854221864513517556c39f6 100644 --- a/mmengine/evaluator/metric.py +++ b/mmengine/evaluator/metric.py @@ -32,17 +32,28 @@ class BaseMetric(metaclass=ABCMeta): names to disambiguate homonymous metrics of different evaluators. If prefix is not provided in the argument, self.default_prefix will be used instead. Default: None + collect_dir: (str, optional): Synchronize directory for collecting data + from different ranks. This argument should only be configured when + ``collect_device`` is 'cpu'. Defaults to None. + `New in version 0.7.3.` """ default_prefix: Optional[str] = None def __init__(self, collect_device: str = 'cpu', - prefix: Optional[str] = None) -> None: + prefix: Optional[str] = None, + collect_dir: Optional[str] = None) -> None: + if collect_dir is not None and collect_device != 'cpu': + raise ValueError('`collec_dir` could only be configured when ' + "`collect_device='cpu'`") + self._dataset_meta: Union[None, dict] = None self.collect_device = collect_device self.results: List[Any] = [] self.prefix = prefix or self.default_prefix + self.collect_dir = collect_dir + if self.prefix is None: print_log( 'The prefix is not set in metric class ' @@ -107,7 +118,14 @@ class BaseMetric(metaclass=ABCMeta): logger='current', level=logging.WARNING) - results = collect_results(self.results, size, self.collect_device) + if self.collect_device == 'cpu': + results = collect_results( + self.results, + size, + self.collect_device, + tmpdir=self.collect_dir) + else: + collect_results(self.results, size, self.collect_device) if is_main_process(): # cast all tensors in results list to cpu @@ -140,12 +158,18 @@ class DumpResults(BaseMetric): collect_device (str): Device name used for collecting results from different ranks during distributed training. Must be 'cpu' or 'gpu'. Defaults to 'cpu'. + collect_dir: (str, optional): Synchronize directory for collecting data + from different ranks. This argument should only be configured when + ``collect_device`` is 'cpu'. Defaults to None. + `New in version 0.7.3.` """ def __init__(self, out_file_path: str, - collect_device: str = 'cpu') -> None: - super().__init__(collect_device=collect_device) + collect_device: str = 'cpu', + collect_dir: Optional[str] = None) -> None: + super().__init__( + collect_device=collect_device, collect_dir=collect_dir) if not out_file_path.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') self.out_file_path = out_file_path diff --git a/tests/test_evaluator/test_metric.py b/tests/test_evaluator/test_metric.py index fb7a181d3bd92d45576c30794ca6a0117794a455..055bd73ca16a5a5ec8f22bab74a0da50b3aadf13 100644 --- a/tests/test_evaluator/test_metric.py +++ b/tests/test_evaluator/test_metric.py @@ -17,6 +17,13 @@ class TestDumpResults(TestCase): 'The output file must be a pkl file.'): DumpResults(out_file_path='./results.json') + # collect_dir could only be configured when collect_device='cpu' + with self.assertRaises(ValueError): + DumpResults( + out_file_path='./results.json', + collect_device='gpu', + collect_dir='./tmp') + def test_process(self): metric = DumpResults(out_file_path='./results.pkl') data_samples = [dict(data=(Tensor([1, 2, 3]), Tensor([4, 5, 6])))]