File size: 5,473 Bytes
f549064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List

from mmengine import get_file_backend, list_from_file

from mmcls.registry import DATASETS
from .base_dataset import BaseDataset
from .categories import CUB_CATEGORIES


@DATASETS.register_module()
class CUB(BaseDataset):
    """The CUB-200-2011 Dataset.

    Support the `CUB-200-2011 <http://www.vision.caltech.edu/visipedia/CUB-200-2011.html>`_ Dataset.
    Comparing with the `CUB-200 <http://www.vision.caltech.edu/visipedia/CUB-200.html>`_ Dataset,
    there are much more pictures in `CUB-200-2011`. After downloading and decompression, the dataset
    directory structure is as follows.

    CUB dataset directory: ::

        CUB-200-2011 (data_root)/
        β”œβ”€β”€ images (data_prefix)
        β”‚   β”œβ”€β”€ class_x
        β”‚   β”‚   β”œβ”€β”€ xx1.jpg
        β”‚   β”‚   β”œβ”€β”€ xx2.jpg
        β”‚   β”‚   └── ...
        β”‚   β”œβ”€β”€ class_y
        β”‚   β”‚   β”œβ”€β”€ yy1.jpg
        β”‚   β”‚   β”œβ”€β”€ yy2.jpg
        β”‚   β”‚   └── ...
        β”‚   └── ...
        β”œβ”€β”€ images.txt (ann_file)
        β”œβ”€β”€ image_class_labels.txt (image_class_labels_file)
        β”œβ”€β”€ train_test_split.txt (train_test_split_file)
        └── ....

    Args:
        data_root (str): The root directory for CUB-200-2011 dataset.
        test_mode (bool): ``test_mode=True`` means in test phase. It determines
             to use the training set or test set.
        ann_file (str, optional): Annotation file path, path relative to
            ``data_root``. Defaults to 'images.txt'.
        data_prefix (str): Prefix for images, path relative to
            ``data_root``. Defaults to 'images'.
        image_class_labels_file (str, optional): The label file, path
            relative to ``data_root``. Defaults to 'image_class_labels.txt'.
        train_test_split_file (str, optional): The split file to split train
            and test dataset, path relative to ``data_root``.
            Defaults to 'train_test_split.txt'.


    Examples:
        >>> from mmcls.datasets import CUB
        >>> cub_train_cfg = dict(data_root='data/CUB_200_2011', test_mode=False)
        >>> cub_train = CUB(**cub_train_cfg)
        >>> cub_train
        Dataset CUB
        Number of samples:  5994
        Number of categories:       200
        Root of dataset:    data/CUB_200_2011
        >>> cub_test_cfg = dict(data_root='data/CUB_200_2011', test_mode=True)
        >>> cub_test = CUB(**cub_test_cfg)
        >>> cub_test
        Dataset CUB
        Number of samples:  5794
        Number of categories:       200
        Root of dataset:    data/CUB_200_2011
    """  # noqa: E501

    METAINFO = {'classes': CUB_CATEGORIES}

    def __init__(self,
                 data_root: str,
                 test_mode: bool,
                 ann_file: str = 'images.txt',
                 data_prefix: str = 'images',
                 image_class_labels_file: str = 'image_class_labels.txt',
                 train_test_split_file: str = 'train_test_split.txt',
                 **kwargs):
        # Resolve a file backend for ``data_root`` so the auxiliary
        # annotation files can be joined/read regardless of storage backend
        # (local disk, object storage, ...).
        self.backend = get_file_backend(data_root, enable_singleton=True)
        self.image_class_labels_file = self.backend.join_path(
            data_root, image_class_labels_file)
        self.train_test_split_file = self.backend.join_path(
            data_root, train_test_split_file)
        super().__init__(
            ann_file=ann_file,
            data_root=data_root,
            data_prefix=data_prefix,
            test_mode=test_mode,
            **kwargs)

    def _load_data_from_txt(self, filepath):
        """Load an index-to-item mapping from a CUB annotation txt file.

        Every line of the file is ``"<idx> <data_item>"``; the returned dict
        maps the zero-based index to the item string.

        Args:
            filepath (str): Path of the annotation file to parse.

        Returns:
            dict: Mapping from zero-based sample index to the data item.
        """
        pairs = list_from_file(filepath)
        data_dict = dict()
        for pair in pairs:
            idx, data_item = pair.split()
            # All the indices start from 1 in CUB files; subtract 1 so
            # they start from 0 and align across the three files.
            data_dict[int(idx) - 1] = data_item
        return data_dict

    def load_data_list(self):
        """Load images and ground truth labels.

        Returns:
            list[dict]: Each item has ``img_path`` and ``gt_label`` keys for
            the samples belonging to the selected (train or test) split.
        """
        # sample id -> relative image path
        sample_dict = self._load_data_from_txt(self.ann_file)

        # sample id -> 1-based class label string
        label_dict = self._load_data_from_txt(self.image_class_labels_file)

        # sample id -> '1' (training sample) or '0' (test sample)
        split_dict = self._load_data_from_txt(self.train_test_split_file)

        assert sample_dict.keys() == label_dict.keys() == split_dict.keys(),\
            f'sample_ids should be same in files {self.ann_file}, ' \
            f'{self.image_class_labels_file} and {self.train_test_split_file}'

        data_list = []
        for sample_id in sample_dict.keys():
            if split_dict[sample_id] == '1' and self.test_mode:
                # skip train samples when test_mode=True
                continue
            elif split_dict[sample_id] == '0' and not self.test_mode:
                # skip test samples when test_mode=False
                continue

            img_path = self.backend.join_path(self.img_prefix,
                                              sample_dict[sample_id])
            # Class labels in CUB are 1-based; convert to 0-based.
            gt_label = int(label_dict[sample_id]) - 1
            info = dict(img_path=img_path, gt_label=gt_label)
            data_list.append(info)

        return data_list

    def extra_repr(self) -> List[str]:
        """The extra repr information of the dataset."""
        body = [
            f'Root of dataset: \t{self.data_root}',
        ]
        return body