diff --git a/tests/data/annotations/dummy_annotation.json b/tests/data/annotations/dummy_annotation.json
new file mode 100644
index 0000000000000000000000000000000000000000..abba398a14b99e059843496991758f25f1ab5acc
--- /dev/null
+++ b/tests/data/annotations/dummy_annotation.json
@@ -0,0 +1,50 @@
+{
+    "metadata":
+    {
+      "dataset_type": "test_dataset",
+      "task_name": "test_task"
+    },
+    "data_infos":
+    [
+      {
+        "img_path": "test_img.jpg",
+        "height": 604,
+        "width": 640,
+        "instances":
+        [
+          {
+            "bbox": [0, 0, 10, 20],
+            "bbox_label": 1,
+            "mask": [[0,0],[0,10],[10,20],[20,0]],
+            "extra_anns": [1,2,3]
+          },
+          {
+            "bbox": [10, 10, 110, 120],
+            "bbox_label": 2,
+            "mask": [[10,10],[10,110],[110,120],[120,10]],
+            "extra_anns": [4,5,6]
+          }
+        ]
+      },
+      {
+        "img_path": "gray.jpg",
+        "height": 288,
+        "width": 512,
+        "instances":
+        [
+          {
+            "bbox": [0, 0, 10, 20],
+            "bbox_label": 1,
+            "mask": [[0,0],[0,10],[10,20],[20,0]],
+            "extra_anns": [1,2,3]
+          },
+          {
+            "bbox": [10, 10, 110, 120],
+            "bbox_label": 2,
+            "mask": [[10,10],[10,110],[110,120],[120,10]],
+            "extra_anns": [4,5,6]
+          }
+        ]
+      }
+    ]
+  }
diff --git a/tests/data/annotations/wrong_annotation.json b/tests/data/annotations/wrong_annotation.json
new file mode 100644
index 0000000000000000000000000000000000000000..31ad01a28f40bd87649417bc2fe4c6c31e8e003e
--- /dev/null
+++ b/tests/data/annotations/wrong_annotation.json
@@ -0,0 +1,50 @@
+{
+    "meta":
+    {
+      "dataset_type": "test_dataset",
+      "task_name": "test_task"
+    },
+    "data":
+    [
+      {
+        "img_path": "test_img.jpg",
+        "height": 604,
+        "width": 640,
+        "instances":
+        [
+          {
+            "bbox": [0, 0, 10, 20],
+            "bbox_label": 1,
+            "mask": [[0,0],[0,10],[10,20],[20,0]],
+            "extra_anns": [1,2,3]
+          },
+          {
+            "bbox": [10, 10, 110, 120],
+            "bbox_label": 2,
+            "mask": [[10,10],[10,110],[110,120],[120,10]],
+            "extra_anns": [4,5,6]
+          }
+        ]
+      },
+      {
+        "img_path": "gray.jpg",
+        "height": 288,
+        "width": 512,
+        "instances":
+        [
+          {
+            "bbox": [0, 0, 10, 20],
+            "bbox_label": 1,
+            "mask": [[0,0],[0,10],[10,20],[20,0]],
+            "extra_anns": [1,2,3]
+          },
+          {
+            "bbox": [10, 10, 110, 120],
+            "bbox_label": 2,
+            "mask": [[10,10],[10,110],[110,120],[120,10]],
+            "extra_anns": [4,5,6]
+          }
+        ]
+      }
+    ]
+  }
diff --git a/tests/data/imgs/gray.jpg b/tests/data/imgs/gray.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/data/imgs/test_img.jpg b/tests/data/imgs/test_img.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/test_data/test_base_dataset.py b/tests/test_data/test_base_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff4f60fcfab4f5d812bfa337d391f1cbebb3fc84
--- /dev/null
+++ b/tests/test_data/test_base_dataset.py
@@ -0,0 +1,386 @@
+import os.path as osp
+from unittest.mock import MagicMock
+
+import pytest
+import torch
+
+from mmengine.data import (BaseDataset, ClassBalancedDataset, ConcatDataset,
+                           RepeatDataset)
+
+
+class TestBaseDataset:
+
+    def setup_method(self):
+        self.base_dataset = BaseDataset
+
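+        # `parse_annotations` and `pipeline` are mocked on the class so the
+        # tests below do not depend on a concrete dataset implementation.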
+        self.data_info = dict(filename='test_img.jpg', height=604, width=640)
+        self.base_dataset.parse_annotations = MagicMock(
+            return_value=self.data_info)
+
+        self.imgs = torch.rand((2, 3, 32, 32))
+        self.base_dataset.pipeline = MagicMock(
+            return_value=dict(imgs=self.imgs))
+
+    def test_init(self):
+        # test the instantiation of self.base_dataset
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json')
+        assert dataset._fully_initialized
+        assert hasattr(dataset, 'data_infos')
+        assert hasattr(dataset, 'data_address')
+
+        # test the instantiation of self.base_dataset with
+        # `serialize_data=False`
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json',
+            serialize_data=False)
+        assert dataset._fully_initialized
+        assert hasattr(dataset, 'data_infos')
+        assert not hasattr(dataset, 'data_address')
+
+        # test the instantiation of self.base_dataset with lazy init
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json',
+            lazy_init=True)
+        assert not dataset._fully_initialized
+        assert not hasattr(dataset, 'data_infos')
+
+        # test the instantiation of self.base_dataset when the ann_file is
+        # invalid: wrong_annotation.json uses `meta`/`data` instead of the
+        # expected `metadata`/`data_infos` keys
+        with pytest.raises(ValueError):
+            self.base_dataset(
+                data_root=osp.join(osp.dirname(__file__), '../data/'),
+                data_prefix=dict(img='imgs'),
+                ann_file='annotations/wrong_annotation.json')
+
+        # test the instantiation of self.base_dataset when `parse_annotations`
+        # returns `list[dict]`
+        self.base_dataset.parse_annotations = MagicMock(
+            return_value=[self.data_info,
+                          self.data_info.copy()])
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json')
+        assert dataset._fully_initialized
+        assert hasattr(dataset, 'data_infos')
+        assert hasattr(dataset, 'data_address')
+        assert len(dataset) == 4
+        assert dataset[0] == dict(imgs=self.imgs)
+        assert dataset.get_data_info(0) == self.data_info
+
+        # set self.base_dataset to initial state
+        self.setup_method()
+
+    def test_meta(self):
+        # test dataset.meta when it comes only from the annotation file's
+        # `metadata` field
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json')
+        assert dataset.meta == dict(
+            dataset_type='test_dataset', task_name='test_task')
+
+        # test dataset.meta with setting META in self.base_dataset
+        dataset_type = 'new_dataset'
+        self.base_dataset.META = dict(
+            dataset_type=dataset_type, classes=('dog', 'cat'))
+
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json')
+        assert dataset.meta == dict(
+            dataset_type=dataset_type,
+            task_name='test_task',
+            classes=('dog', 'cat'))
+
+        # test dataset.meta with passing meta into self.base_dataset
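+        # the `meta` argument should take precedence over `META` and the
+        # annotation file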
+        meta = dict(classes=('dog', ))
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json',
+            meta=meta)
+        assert self.base_dataset.META == dict(
+            dataset_type=dataset_type, classes=('dog', 'cat'))
+        assert dataset.meta == dict(
+            dataset_type=dataset_type,
+            task_name='test_task',
+            classes=('dog', ))
+        # modifying `base_dataset.META` afterwards should not change
+        # `dataset.meta`
+        self.base_dataset.META['classes'] = ('dog', 'cat', 'fish')
+        assert self.base_dataset.META == dict(
+            dataset_type=dataset_type, classes=('dog', 'cat', 'fish'))
+        assert dataset.meta == dict(
+            dataset_type=dataset_type,
+            task_name='test_task',
+            classes=('dog', ))
+
+        # test dataset.meta with passing meta into self.base_dataset and
+        # lazy_init is True
+        meta = dict(classes=('dog', ))
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json',
+            meta=meta,
+            lazy_init=True)
+        # the annotation file has not been parsed yet, so 'task_name' is
+        # not in dataset.meta
+        assert dataset.meta == dict(
+            dataset_type=dataset_type, classes=('dog', ))
+
+        # test whether self.base_dataset.META is changed when a customized
+        # dataset inherits from self.base_dataset
+        # test redefining META in ToyDataset.
+        class ToyDataset(self.base_dataset):
+            META = dict(xxx='xxx')
+
+        assert ToyDataset.META == dict(xxx='xxx')
+        assert self.base_dataset.META == dict(
+            dataset_type=dataset_type, classes=('dog', 'cat', 'fish'))
+
+        # test updating META in ToyDataset without modifying
+        # self.base_dataset.META in place.
+        class ToyDataset(self.base_dataset):
+            META = dict(self.base_dataset.META, classes=('bird', ))
+
+        assert ToyDataset.META == dict(
+            dataset_type=dataset_type, classes=('bird', ))
+        assert self.base_dataset.META == dict(
+            dataset_type=dataset_type, classes=('dog', 'cat', 'fish'))
+
+        # set self.base_dataset to initial state
+        self.setup_method()
+
+    @pytest.mark.parametrize('lazy_init', [True, False])
+    def test_length(self, lazy_init):
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json',
+            lazy_init=lazy_init)
+
+        if not lazy_init:
+            assert dataset._fully_initialized
+            assert hasattr(dataset, 'data_infos')
+            assert len(dataset) == 2
+        else:
+            # test `__len__()` when lazy_init is True
+            assert not dataset._fully_initialized
+            assert not hasattr(dataset, 'data_infos')
+            # call `full_init()` automatically
+            assert len(dataset) == 2
+            assert dataset._fully_initialized
+            assert hasattr(dataset, 'data_infos')
+
+    @pytest.mark.parametrize('lazy_init', [True, False])
+    def test_getitem(self, lazy_init):
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json',
+            lazy_init=lazy_init)
+
+        if not lazy_init:
+            assert dataset._fully_initialized
+            assert hasattr(dataset, 'data_infos')
+            assert dataset[0] == dict(imgs=self.imgs)
+        else:
+            # test `__getitem__()` when lazy_init is True
+            assert not dataset._fully_initialized
+            assert not hasattr(dataset, 'data_infos')
+            # call `full_init()` automatically
+            assert dataset[0] == dict(imgs=self.imgs)
+            assert dataset._fully_initialized
+            assert hasattr(dataset, 'data_infos')
+
+    @pytest.mark.parametrize('lazy_init', [True, False])
+    def test_get_data_info(self, lazy_init):
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json',
+            lazy_init=lazy_init)
+
+        if not lazy_init:
+            assert dataset._fully_initialized
+            assert hasattr(dataset, 'data_infos')
+            assert dataset.get_data_info(0) == self.data_info
+        else:
+            # test `get_data_info()` when lazy_init is True
+            assert not dataset._fully_initialized
+            assert not hasattr(dataset, 'data_infos')
+            # call `full_init()` automatically
+            assert dataset.get_data_info(0) == self.data_info
+            assert dataset._fully_initialized
+            assert hasattr(dataset, 'data_infos')
+
+    @pytest.mark.parametrize('lazy_init', [True, False])
+    def test_full_init(self, lazy_init):
+        dataset = self.base_dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json',
+            lazy_init=lazy_init)
+
+        if not lazy_init:
+            assert dataset._fully_initialized
+            assert hasattr(dataset, 'data_infos')
+            assert len(dataset) == 2
+            assert dataset[0] == dict(imgs=self.imgs)
+            assert dataset.get_data_info(0) == self.data_info
+        else:
+            # test `full_init()` when lazy_init is True
+            assert not dataset._fully_initialized
+            assert not hasattr(dataset, 'data_infos')
+            # call `full_init()` manually
+            dataset.full_init()
+            assert dataset._fully_initialized
+            assert hasattr(dataset, 'data_infos')
+            assert len(dataset) == 2
+            assert dataset[0] == dict(imgs=self.imgs)
+            assert dataset.get_data_info(0) == self.data_info
+
+
+class TestConcatDataset:
+
+    def setup_method(self):
+        dataset = BaseDataset
+
+        # create dataset_a
+        data_info = dict(filename='test_img.jpg', height=604, width=640)
+        dataset.parse_annotations = MagicMock(return_value=data_info)
+        imgs = torch.rand((2, 3, 32, 32))
+        dataset.pipeline = MagicMock(return_value=dict(imgs=imgs))
+        self.dataset_a = dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json')
+
+        # create dataset_b with a different `meta`
+        data_info = dict(filename='gray.jpg', height=288, width=512)
+        dataset.parse_annotations = MagicMock(return_value=data_info)
+        imgs = torch.rand((2, 3, 32, 32))
+        dataset.pipeline = MagicMock(return_value=dict(imgs=imgs))
+        self.dataset_b = dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json',
+            meta=dict(classes=('dog', 'cat')))
+
+        # test init
+        self.cat_datasets = ConcatDataset(
+            datasets=[self.dataset_a, self.dataset_b])
+
+    def test_meta(self):
+        assert self.cat_datasets.meta == self.dataset_a.meta
+        # meta of self.cat_datasets is from the first dataset when
+        # concatenating datasets with different metas.
+        assert self.cat_datasets.meta != self.dataset_b.meta
+
+    def test_length(self):
+        assert len(self.cat_datasets) == (
+            len(self.dataset_a) + len(self.dataset_b))
+
+    def test_getitem(self):
+        assert self.cat_datasets[0] == self.dataset_a[0]
+        assert self.cat_datasets[0] != self.dataset_b[0]
+
+        assert self.cat_datasets[-1] == self.dataset_b[-1]
+        assert self.cat_datasets[-1] != self.dataset_a[-1]
+
+    def test_get_data_info(self):
+        assert self.cat_datasets.get_data_info(
+            0) == self.dataset_a.get_data_info(0)
+        assert self.cat_datasets.get_data_info(
+            0) != self.dataset_b.get_data_info(0)
+
+        assert self.cat_datasets.get_data_info(
+            -1) == self.dataset_b.get_data_info(-1)
+        assert self.cat_datasets.get_data_info(
+            -1) != self.dataset_a.get_data_info(-1)
+
+
+class TestRepeatDataset:
+
+    def setup_method(self):
+        dataset = BaseDataset
+        data_info = dict(filename='test_img.jpg', height=604, width=640)
+        dataset.parse_annotations = MagicMock(return_value=data_info)
+        imgs = torch.rand((2, 3, 32, 32))
+        dataset.pipeline = MagicMock(return_value=dict(imgs=imgs))
+        self.dataset = dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json')
+
+        self.repeat_times = 5
+        # test init
+        self.repeat_datasets = RepeatDataset(
+            dataset=self.dataset, times=self.repeat_times)
+
+    def test_meta(self):
+        assert self.repeat_datasets.meta == self.dataset.meta
+
+    def test_length(self):
+        assert len(
+            self.repeat_datasets) == len(self.dataset) * self.repeat_times
+
+    def test_getitem(self):
+        for i in range(self.repeat_times):
+            assert self.repeat_datasets[len(self.dataset) *
+                                        i] == self.dataset[0]
+
+    def test_get_data_info(self):
+        for i in range(self.repeat_times):
+            assert self.repeat_datasets.get_data_info(
+                len(self.dataset) * i) == self.dataset.get_data_info(0)
+
+
+class TestClassBalancedDataset:
+
+    def setup_method(self):
+        dataset = BaseDataset
+        data_info = dict(filename='test_img.jpg', height=604, width=640)
+        dataset.parse_annotations = MagicMock(return_value=data_info)
+        imgs = torch.rand((2, 3, 32, 32))
+        dataset.pipeline = MagicMock(return_value=dict(imgs=imgs))
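+        # mock `get_cat_ids` so that every sample reports category 0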
+        dataset.get_cat_ids = MagicMock(return_value=[0])
+        self.dataset = dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/'),
+            data_prefix=dict(img='imgs'),
+            ann_file='annotations/dummy_annotation.json')
+
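+        # use a fixed repeat_indices list so the expected mapping back to
+        # self.dataset is deterministic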
+        self.repeat_indices = [0, 0, 1, 1, 1]
+        # test init
+        self.cls_balanced_datasets = ClassBalancedDataset(
+            dataset=self.dataset, oversample_thr=1e-3)
+        self.cls_balanced_datasets.repeat_indices = self.repeat_indices
+
+    def test_meta(self):
+        assert self.cls_balanced_datasets.meta == self.dataset.meta
+
+    def test_length(self):
+        assert len(self.cls_balanced_datasets) == len(self.repeat_indices)
+
+    def test_getitem(self):
+        for i in range(len(self.repeat_indices)):
+            assert self.cls_balanced_datasets[i] == self.dataset[
+                self.repeat_indices[i]]
+
+    def test_get_data_info(self):
+        for i in range(len(self.repeat_indices)):
+            assert self.cls_balanced_datasets.get_data_info(
+                i) == self.dataset.get_data_info(self.repeat_indices[i])
+
+    def test_get_cat_ids(self):
+        for i in range(len(self.repeat_indices)):
+            assert self.cls_balanced_datasets.get_cat_ids(
+                i) == self.dataset.get_cat_ids(self.repeat_indices[i])