!11623 Fix issues in MindData (MD)

From: @luoyang42
Reviewed-by: 
Signed-off-by:
pull/11623/MERGE
mindspore-ci-bot committed 4 years ago via Gitee
commit f2fd357ffc

@@ -288,14 +288,17 @@ def check_sampler_shuffle_shard_options(param_dict):
     """
     shuffle, sampler = param_dict.get('shuffle'), param_dict.get('sampler')
     num_shards, shard_id = param_dict.get('num_shards'), param_dict.get('shard_id')
+    num_samples = param_dict.get('num_samples')
     type_check(sampler, (type(None), samplers.BuiltinSampler, samplers.Sampler), "sampler")
     if sampler is not None:
         if shuffle is not None:
             raise RuntimeError("sampler and shuffle cannot be specified at the same time.")
-        if num_shards is not None:
+        if num_shards is not None or shard_id is not None:
             raise RuntimeError("sampler and sharding cannot be specified at the same time.")
+        if num_samples is not None:
+            raise RuntimeError("sampler and num_samples cannot be specified at the same time.")
     if num_shards is not None:
         check_pos_int32(num_shards)
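For reference, a minimal sketch of how the tightened validation surfaces to callers; the CIFAR-10 path below is a placeholder, not taken from this change:

    import mindspore.dataset as ds

    sampler = ds.SequentialSampler(start_index=0, num_samples=4)
    try:
        # Combining a sampler with num_samples is now rejected up front with a clear message,
        # instead of surfacing later as a conflicting-arguments ValueError.
        data = ds.Cifar10Dataset("/path/to/cifar-10-batches-bin", sampler=sampler, num_samples=10)
    except RuntimeError as e:
        print(e)  # sampler and num_samples cannot be specified at the same time.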

@@ -3045,7 +3045,8 @@ class ImageFolderDataset(MappableDataset):
             unique index starting from 0).
         decode (bool, optional): Decode the images after reading (default=False).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
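A hedged illustration of the documented semantics; the dataset directory and shard values are placeholders:

    import mindspore.dataset as ds

    # Four shards, reading shard 0; with num_shards specified, num_samples is the
    # maximum number of samples this shard will yield (at most 100 here).
    data = ds.ImageFolderDataset("/path/to/image_folder_dataset", num_shards=4, shard_id=0, num_samples=100)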
@@ -3194,7 +3195,8 @@ class MnistDataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the
             dataset (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -3277,6 +3279,7 @@ class MindDataset(MappableDataset):
         shuffle (bool, optional): Whether or not to perform shuffle on the dataset
             (default=None, performs shuffle).
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+            When this argument is specified, 'num_samples' reflects the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         sampler (Sampler, optional): Object used to choose samples from the
@@ -3750,7 +3753,8 @@ class GeneratorDataset(MappableDataset):
         sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible
             input is required (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
-            When this argument is specified, 'num_samples' will not used. Random accessible input is required.
+            Random accessible input is required. When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only
             when num_shards is also specified. Random accessible input is required.
         python_multiprocessing (bool, optional): Parallelize Python operations with multiple worker process. This
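Because GeneratorDataset requires a random-accessible source when sharding, a small hedged sketch (the source class and column name are illustrative, not part of this change):

    import numpy as np
    import mindspore.dataset as ds

    class RandomAccessSource:
        """Illustrative random-accessible source: supports __getitem__ and __len__."""
        def __init__(self):
            self._data = np.arange(10, dtype=np.int32)

        def __getitem__(self, index):
            return (np.array(self._data[index]),)

        def __len__(self):
            return len(self._data)

    # Two shards over ten rows; with num_shards specified, num_samples caps shard 0 at 3 rows.
    data = ds.GeneratorDataset(RandomAccessSource(), column_names=["data"],
                               num_shards=2, shard_id=0, num_samples=3)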
@@ -3925,7 +3929,8 @@ class TFRecordDataset(SourceDataset):
             - Shuffle.FILES: Shuffle files only.
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         shard_equal_rows (bool, optional): Get equal rows for all shards(default=False). If shard_equal_rows
@@ -4118,7 +4123,8 @@ class ManifestDataset(MappableDataset):
             class will be given a unique index starting from 0).
         decode (bool, optional): decode the images after reading (default=False).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4282,7 +4288,8 @@ class Cifar10Dataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the
             dataset (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4423,7 +4430,8 @@ class Cifar100Dataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the
             dataset (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4513,7 +4521,8 @@ class RandomDataset(SourceDataset):
         shuffle (bool, optional): Whether or not to perform shuffle on the dataset
             (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
     """
@@ -4769,7 +4778,8 @@ class VOCDataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the dataset
             (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4965,7 +4975,8 @@ class CocoDataset(MappableDataset):
         sampler (Sampler, optional): Object used to choose samples from the dataset
             (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5118,7 +5129,8 @@ class CelebADataset(MappableDataset):
         num_samples (int, optional): The number of images to be included in the dataset.
             (default=None, all images).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None).
+            into (default=None). When this argument is specified, 'num_samples' reflects
+            the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5232,6 +5244,7 @@ class CLUEDataset(SourceDataset):
             - Shuffle.FILES: Shuffle files only.
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+            When this argument is specified, 'num_samples' reflects the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5466,6 +5479,7 @@ class CSVDataset(SourceDataset):
             - Shuffle.FILES: Shuffle files only.
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+            When this argument is specified, 'num_samples' reflects the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5578,6 +5592,7 @@ class TextFileDataset(SourceDataset):
             - Shuffle.FILES: Shuffle files only.
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+            When this argument is specified, 'num_samples' reflects the maximum number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This
             argument can only be specified when num_shards is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5782,7 +5797,8 @@ class NumpySlicesDataset(GeneratorDataset):
         sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible
             input is required (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
-            When this argument is specified, 'num_samples' will not used. Random accessible input is required.
+            Random accessible input is required. When this argument is specified, 'num_samples' reflects the maximum
+            number of samples per shard.
         shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only
             when num_shards is also specified. Random accessible input is required.
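The same per-shard semantics apply to in-memory data; a hedged sketch (the column name and values are illustrative):

    import mindspore.dataset as ds

    # Ten in-memory rows split across two shards; num_samples caps shard 0 at three rows.
    data = ds.NumpySlicesDataset(list(range(10)), column_names=["col1"],
                                 num_shards=2, shard_id=0, num_samples=3)
    print(data.get_dataset_size())  # expected to report 3 with the per-shard cap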

@@ -137,6 +137,20 @@ class BuiltinSampler:
         pass
     def add_child(self, sampler):
+        """
+        Add a sub-sampler for the given sampler. The sub-sampler will receive all data from the
+        output of the parent sampler and apply its sample logic to return new samples.
+        Args:
+            sampler (Sampler): Object used to choose samples from the dataset. Only builtin
+                samplers (DistributedSampler, PKSampler, RandomSampler, SequentialSampler,
+                SubsetRandomSampler, WeightedRandomSampler) are supported.
+        Examples:
+            >>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
+            >>> sampler.add_child(ds.RandomSampler(num_samples=2))
+            >>> dataset = ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler)
+        """
         self.child_sampler = sampler
     def get_child(self):
@@ -448,7 +462,7 @@ class SequentialSampler(BuiltinSampler):
     Samples the dataset elements sequentially, same as not having a sampler.
     Args:
-        start_index (int, optional): Index to start sampling at. (dafault=None, start at first ID)
+        start_index (int, optional): Index to start sampling at. (default=None, start at first ID)
         num_samples (int, optional): Number of elements to sample (default=None, all elements).
     Examples:

@@ -232,9 +232,9 @@ def test_add_sampler_invalid_input():
     assert "not an instance of a sampler" in str(info.value)
     sampler = ds.SequentialSampler()
-    with pytest.raises(ValueError) as info:
+    with pytest.raises(RuntimeError) as info:
        data2 = ds.ManifestDataset(manifest_file, sampler=sampler, num_samples=20)
-    assert "Conflicting arguments during sampler assignments" in str(info.value)
+    assert "sampler and num_samples cannot be specified at the same time" in str(info.value)
 def test_distributed_sampler_invalid_offset():
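For completeness, a hedged sketch of a companion check (not part of this change) covering the other conflict introduced by the validator update, a sampler combined with sharding arguments:

    sampler = ds.SequentialSampler()
    with pytest.raises(RuntimeError) as info:
        data3 = ds.ManifestDataset(manifest_file, sampler=sampler, num_shards=2, shard_id=0)
    assert "sampler and sharding cannot be specified at the same time" in str(info.value)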
