!11623 Fix issues in MindData (MD)

From: @luoyang42
Reviewed-by: 
Signed-off-by:
pull/11623/MERGE
mindspore-ci-bot committed 4 years ago via Gitee
commit f2fd357ffc

@@ -288,14 +288,17 @@ def check_sampler_shuffle_shard_options(param_dict):
     """
     shuffle, sampler = param_dict.get('shuffle'), param_dict.get('sampler')
     num_shards, shard_id = param_dict.get('num_shards'), param_dict.get('shard_id')
+    num_samples = param_dict.get('num_samples')
     type_check(sampler, (type(None), samplers.BuiltinSampler, samplers.Sampler), "sampler")
     if sampler is not None:
         if shuffle is not None:
             raise RuntimeError("sampler and shuffle cannot be specified at the same time.")
-        if num_shards is not None:
+        if num_shards is not None or shard_id is not None:
             raise RuntimeError("sampler and sharding cannot be specified at the same time.")
+        if num_samples is not None:
+            raise RuntimeError("sampler and num_samples cannot be specified at the same time.")
     if num_shards is not None:
         check_pos_int32(num_shards)
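For reference, a minimal sketch of how the tightened validation surfaces to callers; the CIFAR-10 path below is a placeholder, not taken from this change:

    import mindspore.dataset as ds

    sampler = ds.SequentialSampler(start_index=0, num_samples=4)
    try:
        # Combining a sampler with num_samples is now rejected up front with a clear message,
        # instead of surfacing later as a conflicting-arguments ValueError.
        data = ds.Cifar10Dataset("/path/to/cifar-10-batches-bin", sampler=sampler, num_samples=10)
    except RuntimeError as e:
        print(e)  # sampler and num_samples cannot be specified at the same time.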

@@ -3045,7 +3045,8 @@ class ImageFolderDataset(MappableDataset):
             unique index starting from 0).
         decode (bool, optional): Decode the images after reading (default=False).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
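A hedged illustration of the documented semantics; the dataset directory and shard values are placeholders:

    import mindspore.dataset as ds

    # Four shards, reading shard 0; with num_shards specified, num_samples is the
    # maximum number of samples this shard will yield (at most 100 here).
    data = ds.ImageFolderDataset("/path/to/image_folder_dataset", num_shards=4, shard_id=0, num_samples=100)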
@@ -3194,7 +3195,8 @@ class MnistDataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the
             dataset (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -3277,6 +3279,7 @@ class MindDataset(MappableDataset):
         shuffle (bool, optional): Whether or not to perform shuffle on the dataset
             (default=None, performs shuffle).
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+            When this argument is specified, 'num_samples' reflects the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         sampler (Sampler, optional): Object used to choose samples from the
@@ -3750,7 +3753,8 @@ class GeneratorDataset(MappableDataset):
         sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible
             input is required (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
-            When this argument is specified, 'num_samples' will not used. Random accessible input is required.
+            Random accessible input is required. When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only
             when num_shards is also specified. Random accessible input is required.
         python_multiprocessing (bool, optional): Parallelize Python operations with multiple worker process. This
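Because GeneratorDataset requires a random-accessible source when sharding, a small hedged sketch (the source class and column name are illustrative, not part of this change):

    import numpy as np
    import mindspore.dataset as ds

    class RandomAccessSource:
        """Illustrative random-accessible source: supports __getitem__ and __len__."""
        def __init__(self):
            self._data = np.arange(10, dtype=np.int32)

        def __getitem__(self, index):
            return (np.array(self._data[index]),)

        def __len__(self):
            return len(self._data)

    # Two shards over ten rows; with num_shards specified, num_samples caps shard 0 at 3 rows.
    data = ds.GeneratorDataset(RandomAccessSource(), column_names=["data"],
                               num_shards=2, shard_id=0, num_samples=3)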
@@ -3925,7 +3929,8 @@ class TFRecordDataset(SourceDataset):
             - Shuffle.FILES: Shuffle files only.
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         shard_equal_rows (bool, optional): Get equal rows for all shards(default=False). If shard_equal_rows
@@ -4118,7 +4123,8 @@ class ManifestDataset(MappableDataset):
             class will be given a unique index starting from 0).
         decode (bool, optional): decode the images after reading (default=False).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4282,7 +4288,8 @@ class Cifar10Dataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the
             dataset (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4423,7 +4430,8 @@ class Cifar100Dataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the
             dataset (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4513,7 +4521,8 @@ class RandomDataset(SourceDataset):
         shuffle (bool, optional): Whether or not to perform shuffle on the dataset
             (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
     """
@@ -4769,7 +4778,8 @@ class VOCDataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the dataset
             (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4965,7 +4975,8 @@ class CocoDataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the dataset
             (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5118,7 +5129,8 @@ class CelebADataset(MappableDataset):
         num_samples (int, optional): The number of images to be included in the dataset.
             (default=None, all images).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5232,6 +5244,7 @@ class CLUEDataset(SourceDataset):
             - Shuffle.FILES: Shuffle files only.
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+            When this argument is specified, 'num_samples' reflects the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5466,6 +5479,7 @@ class CSVDataset(SourceDataset):
             - Shuffle.FILES: Shuffle files only.
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+            When this argument is specified, 'num_samples' reflects the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5578,6 +5592,7 @@ class TextFileDataset(SourceDataset):
             - Shuffle.FILES: Shuffle files only.
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+            When this argument is specified, 'num_samples' reflects the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5782,7 +5797,8 @@ class NumpySlicesDataset(GeneratorDataset):
         sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible
             input is required (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
-            When this argument is specified, 'num_samples' will not used. Random accessible input is required.
+            Random accessible input is required. When this argument is specified, 'num_samples' reflects the maximum
+            number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only
             when num_shards is also specified. Random accessible input is required.
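The same per-shard semantics apply to in-memory data; a hedged sketch (the column name and values are illustrative):

    import mindspore.dataset as ds

    # Ten in-memory rows split across two shards; num_samples caps shard 0 at three rows.
    data = ds.NumpySlicesDataset(list(range(10)), column_names=["col1"],
                                 num_shards=2, shard_id=0, num_samples=3)
    print(data.get_dataset_size())  # expected to report 3 with the per-shard cap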

@@ -137,6 +137,20 @@ class BuiltinSampler:
         pass
     def add_child(self, sampler):
+        """
+        Add a sub-sampler for the given sampler. The sub-sampler will receive all data from the
+        output of the parent sampler and apply its sample logic to return new samples.
+        Args:
+            sampler (Sampler): Object used to choose samples from the dataset. Only builtin
+                samplers (DistributedSampler, PKSampler, RandomSampler, SequentialSampler,
+                SubsetRandomSampler, WeightedRandomSampler) are supported.
+        Examples:
+            >>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
+            >>> sampler.add_child(ds.RandomSampler(num_samples=2))
+            >>> dataset = ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler)
+        """
         self.child_sampler = sampler
     def get_child(self):
@@ -448,7 +462,7 @@ class SequentialSampler(BuiltinSampler):
     Samples the dataset elements sequentially, same as not having a sampler.
     Args:
-        start_index (int, optional): Index to start sampling at. (dafault=None, start at first ID)
+        start_index (int, optional): Index to start sampling at. (default=None, start at first ID)
         num_samples (int, optional): Number of elements to sample (default=None, all elements).
     Examples:

@@ -232,9 +232,9 @@ def test_add_sampler_invalid_input():
     assert "not an instance of a sampler" in str(info.value)
     sampler = ds.SequentialSampler()
-    with pytest.raises(ValueError) as info:
+    with pytest.raises(RuntimeError) as info:
        data2 = ds.ManifestDataset(manifest_file, sampler=sampler, num_samples=20)
-    assert "Conflicting arguments during sampler assignments" in str(info.value)
+    assert "sampler and num_samples cannot be specified at the same time" in str(info.value)
 def test_distributed_sampler_invalid_offset():
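For completeness, a hedged sketch of a companion check (not part of this change) covering the other conflict introduced by the validator update, a sampler combined with sharding arguments:

    sampler = ds.SequentialSampler()
    with pytest.raises(RuntimeError) as info:
        data3 = ds.ManifestDataset(manifest_file, sampler=sampler, num_shards=2, shard_id=0)
    assert "sampler and sharding cannot be specified at the same time" in str(info.value)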
