From a7d4b7c7428061c8b85360a0e4e07d95eb1d5261 Mon Sep 17 00:00:00 2001
From: Mashiro <57566630+HAOCHENYE@users.noreply.github.com>
Date: Fri, 14 Apr 2023 19:15:04 +0800
Subject: [PATCH] [Enhance] Support configuring directory used to synchronize
 results in BaseMetric (#1074)

* [Enhance] Support configuring synchronize directory for BaseMetric

* Raise error if tmpdir is not an shared dirctory for ann ranks

* Raise error if tmpdir is not an shared dirctory for ann ranks

* Update mmengine/evaluator/metric.py

Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>

* refine

* Update mmengine/evaluator/metric.py

---------

Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
---
 mmengine/dist/dist.py               |  5 +++++
 mmengine/evaluator/metric.py        | 32 +++++++++++++++++++++++++----
 tests/test_evaluator/test_metric.py |  7 +++++++
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/mmengine/dist/dist.py b/mmengine/dist/dist.py
index b989c45a..f23f1d06 100644
--- a/mmengine/dist/dist.py
+++ b/mmengine/dist/dist.py
@@ -1000,6 +1000,11 @@ def collect_results_cpu(result_part: list,
         part_list = []
         for i in range(world_size):
             path = osp.join(tmpdir, f'part_{i}.pkl')  # type: ignore
+            if not osp.exists(path):
+                raise FileNotFoundError(
+                    f'{tmpdir} is not an shared directory for '
+                    f'rank {i}, please make sure {tmpdir} is a shared '
+                    'directory for all ranks!')
             with open(path, 'rb') as f:
                 part_list.append(pickle.load(f))
         # sort the results
diff --git a/mmengine/evaluator/metric.py b/mmengine/evaluator/metric.py
index a32e2848..3b1fc62b 100644
--- a/mmengine/evaluator/metric.py
+++ b/mmengine/evaluator/metric.py
@@ -32,17 +32,28 @@ class BaseMetric(metaclass=ABCMeta):
             names to disambiguate homonymous metrics of different evaluators.
             If prefix is not provided in the argument, self.default_prefix
             will be used instead. Default: None
+        collect_dir: (str, optional): Synchronize directory for collecting data
+            from different ranks. This argument should only be configured when
+            ``collect_device`` is 'cpu'. Defaults to None.
+            `New in version 0.7.3.`
     """
 
     default_prefix: Optional[str] = None
 
     def __init__(self,
                  collect_device: str = 'cpu',
-                 prefix: Optional[str] = None) -> None:
+                 prefix: Optional[str] = None,
+                 collect_dir: Optional[str] = None) -> None:
+        if collect_dir is not None and collect_device != 'cpu':
+            raise ValueError('`collec_dir` could only be configured when '
+                             "`collect_device='cpu'`")
+
         self._dataset_meta: Union[None, dict] = None
         self.collect_device = collect_device
         self.results: List[Any] = []
         self.prefix = prefix or self.default_prefix
+        self.collect_dir = collect_dir
+
         if self.prefix is None:
             print_log(
                 'The prefix is not set in metric class '
@@ -107,7 +118,14 @@ class BaseMetric(metaclass=ABCMeta):
                 logger='current',
                 level=logging.WARNING)
 
-        results = collect_results(self.results, size, self.collect_device)
+        if self.collect_device == 'cpu':
+            results = collect_results(
+                self.results,
+                size,
+                self.collect_device,
+                tmpdir=self.collect_dir)
+        else:
+            collect_results(self.results, size, self.collect_device)
 
         if is_main_process():
             # cast all tensors in results list to cpu
@@ -140,12 +158,18 @@ class DumpResults(BaseMetric):
         collect_device (str): Device name used for collecting results from
             different ranks during distributed training. Must be 'cpu' or
             'gpu'. Defaults to 'cpu'.
+        collect_dir: (str, optional): Synchronize directory for collecting data
+            from different ranks. This argument should only be configured when
+            ``collect_device`` is 'cpu'. Defaults to None.
+            `New in version 0.7.3.`
     """
 
     def __init__(self,
                  out_file_path: str,
-                 collect_device: str = 'cpu') -> None:
-        super().__init__(collect_device=collect_device)
+                 collect_device: str = 'cpu',
+                 collect_dir: Optional[str] = None) -> None:
+        super().__init__(
+            collect_device=collect_device, collect_dir=collect_dir)
         if not out_file_path.endswith(('.pkl', '.pickle')):
             raise ValueError('The output file must be a pkl file.')
         self.out_file_path = out_file_path
diff --git a/tests/test_evaluator/test_metric.py b/tests/test_evaluator/test_metric.py
index fb7a181d..055bd73c 100644
--- a/tests/test_evaluator/test_metric.py
+++ b/tests/test_evaluator/test_metric.py
@@ -17,6 +17,13 @@ class TestDumpResults(TestCase):
                                     'The output file must be a pkl file.'):
             DumpResults(out_file_path='./results.json')
 
+        # collect_dir could only be configured when collect_device='cpu'
+        with self.assertRaises(ValueError):
+            DumpResults(
+                out_file_path='./results.json',
+                collect_device='gpu',
+                collect_dir='./tmp')
+
     def test_process(self):
         metric = DumpResults(out_file_path='./results.pkl')
         data_samples = [dict(data=(Tensor([1, 2, 3]), Tensor([4, 5, 6])))]
-- 
GitLab