diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index c944c8824f..0cf40b1d3e 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -1732,10 +1732,7 @@ class MappableDataset(SourceDataset):
             new_sampler (Sampler): The sampler to use for the current dataset.

         Examples:
-            >>> # Note: A SequentialSampler is created by default
-            >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir)
-            >>>
-            >>> # Use a DistributedSampler instead of the SequentialSampler
+            >>> # Use a DistributedSampler instead
             >>> new_sampler = ds.DistributedSampler(10, 2)
             >>> dataset.use_sampler(new_sampler)
         """
@@ -2888,15 +2885,15 @@ class MnistDataset(MappableDataset):
     The generated dataset has two columns ['image', 'label'].
     The type of the image tensor is uint8. The label is a scalar uint32 tensor.

-    This dataset can take in a sampler. 'sampler' and 'shuffle' are mutually exclusive. The table
+    This dataset can take in a sampler. `sampler` and `shuffle` are mutually exclusive. The table
     below shows what input arguments are allowed and their expected behavior.

     .. list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle'
        :widths: 25 25 50
        :header-rows: 1

-       * - Parameter 'sampler'
-         - Parameter 'shuffle'
+       * - Parameter `sampler`
+         - Parameter `shuffle`
          - Expected Order Behavior
        * - None
          - None
@@ -2937,19 +2934,19 @@ class MnistDataset(MappableDataset):
         dataset_dir (str): Path to the root directory that contains the dataset.
         usage (str, optional): Usage of this dataset, can be "train", "test" or "all" . "train" will read from 60,000
             train samples, "test" will read from 10,000 test samples, "all" will read from all 70,000 samples.
-            (default=None, all samples)
+            (default=None, will read all samples)
         num_samples (int, optional): The number of images to be included in the dataset
-            (default=None, all images).
+            (default=None, will read all images).
         num_parallel_workers (int, optional): Number of workers to read the data
-            (default=None, set in the config).
+            (default=None, will use value set in the config).
         shuffle (bool, optional): Whether or not to perform shuffle on the dataset
             (default=None, expected order behavior shown in the table).
         sampler (Sampler, optional): Object used to choose samples from the
             dataset (default=None, expected order behavior shown in the table).
         num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
-            When this argument is specified, 'num_samples' reflects the max sample number of per shard.
-        shard_id (int, optional): The shard ID within num_shards (default=None). This
-            argument can only be specified when num_shards is also specified.
+            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
+        shard_id (int, optional): The shard ID within `num_shards` (default=None). This
+            argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
             (default=None, which means no cache is used).
@@ -3587,15 +3584,15 @@ class ManifestDataset(MappableDataset):
     The shape of the image column is [image_size] if decode flag is False, or [H,W,C] otherwise.
     The type of the image tensor is uint8. The label is a scalar uint64 tensor.

-    This dataset can take in a sampler. 'sampler' and 'shuffle' are mutually exclusive. The table
+    This dataset can take in a sampler. `sampler` and `shuffle` are mutually exclusive. The table
     below shows what input arguments are allowed and their expected behavior.

-    .. list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle'
+    .. list-table:: Expected Order Behavior of Using `sampler` and `shuffle`
        :widths: 25 25 50
        :header-rows: 1

-       * - Parameter 'sampler'
-         - Parameter 'shuffle'
+       * - Parameter `sampler`
+         - Parameter `shuffle`
          - Expected Order Behavior
        * - None
          - None
@@ -3618,11 +3615,11 @@ class ManifestDataset(MappableDataset):

     Args:
         dataset_file (str): File to be read.
-        usage (str, optional): acceptable usages include train, eval and inference (default="train").
+        usage (str, optional): Acceptable usages include "train", "eval" and "inference" (default="train").
         num_samples (int, optional): The number of images to be included in the dataset.
-            (default=None, all images).
+            (default=None, will include all images).
         num_parallel_workers (int, optional): Number of workers to read the data
-            (default=None, number set in the config).
+            (default=None, will use value set in the config).
         shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None, expected
             order behavior shown in the table).
         sampler (Sampler, optional): Object used to choose samples from the
@@ -3632,10 +3629,10 @@ class ManifestDataset(MappableDataset):
             class will be given a unique index starting from 0).
         decode (bool, optional): decode the images after reading (default=False).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None). When this argument is specified, 'num_samples' reflects
+            into (default=None). When this argument is specified, `num_samples` reflects
             the max sample number of per shard.
-        shard_id (int, optional): The shard ID within num_shards (default=None). This
-            argument can only be specified when num_shards is also specified.
+        shard_id (int, optional): The shard ID within `num_shards` (default=None). This
+            argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
             (default=None, which means no cache is used).
@@ -4195,7 +4192,8 @@ class CocoDataset(MappableDataset):
     """
     A source dataset for reading and parsing COCO dataset.

-    CocoDataset support four kinds of task: 2017 Train/Val/Test Detection, Keypoints, Stuff, Panoptic.
+    `CocoDataset` supports four kinds of tasks, which are Object Detection, Keypoint Detection, Stuff Segmentation and
+    Panoptic Segmentation of the 2017 Train/Val/Test dataset.

     The generated dataset has multi-columns :
@@ -4339,11 +4337,12 @@ class CocoDataset(MappableDataset):

 class CelebADataset(MappableDataset):
     """
-    A source dataset for reading and parsing CelebA dataset. Currently supported: list_attr_celeba.txt only.
+    A source dataset for reading and parsing CelebA dataset. Currently it only supports reading `list_attr_celeba.txt`,
+    which contains the attribute annotations of the dataset.

     Note:
         The generated dataset has two columns ['image', 'attr'].
-        The type of the image tensor is uint8. The attribute tensor is uint32 and one hot type.
+        The image tensor is of the uint8 type. The attribute tensor is of the uint32 type and one-hot encoded.

     Citation of CelebA dataset.
@@ -4376,20 +4375,20 @@ class CelebADataset(MappableDataset):

     Args:
         dataset_dir (str): Path to the root directory that contains the dataset.
-        num_parallel_workers (int, optional): Number of workers to read the data (default=value set in the config).
+        num_parallel_workers (int, optional): Number of workers to read the data (default=None, will use value set in
+            the config).
         shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None).
-        usage (str): one of 'all', 'train', 'valid' or 'test'.
+        usage (str): One of 'all', 'train', 'valid' or 'test' (default='all', will read all samples).
         sampler (Sampler, optional): Object used to choose samples from the dataset (default=None).
         decode (bool, optional): decode the images after reading (default=False).
-        extensions (list[str], optional): List of file extensions to be
-            included in the dataset (default=None).
-        num_samples (int, optional): The number of images to be included in the dataset.
-            (default=None, all images).
+        extensions (list[str], optional): List of file extensions to be included in the dataset (default=None).
+        num_samples (int, optional): The number of images to be included in the dataset
+            (default=None, will include all images).
         num_shards (int, optional): Number of shards that the dataset will be divided
-            into (default=None). When this argument is specified, 'num_samples' reflects
+            into (default=None). When this argument is specified, `num_samples` reflects
             the max sample number of per shard.
-        shard_id (int, optional): The shard ID within num_shards (default=None). This
-            argument can only be specified when num_shards is also specified.
+        shard_id (int, optional): The shard ID within `num_shards` (default=None). This
+            argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
             (default=None, which means no cache is used).
diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index a615ab4742..6ca5a54923 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -63,6 +63,7 @@ class TextTensorOperation(TensorOperation):
     """
     Base class of Text Tensor Ops
     """
+
     def __call__(self, input_tensor):
         if not isinstance(input_tensor, list):
             input_list = [input_tensor]
@@ -95,13 +96,11 @@ DE_C_INTER_JIEBA_MODE = {
     JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM
 }

-
 DE_C_INTER_SENTENCEPIECE_LOADTYPE = {
     SPieceTokenizerLoadType.FILE: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KFILE,
     SPieceTokenizerLoadType.MODEL: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KMODEL
 }

-
 DE_C_INTER_SENTENCEPIECE_OUTTYPE = {
     SPieceTokenizerOutType.STRING: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KString,
     SPieceTokenizerOutType.INT: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KINT
@@ -281,7 +280,7 @@ class Lookup(TextTensorOperation):
         vocab (Vocab): A vocabulary object.
         unknown_token (str, optional): Word used for lookup if the word being looked up is out-of-vocabulary (OOV).
             If unknown_token is OOV, a runtime error will be thrown (default=None).
-        data_type (mindspore.dtype, optional): mindspore.dtype that lookup maps string to (default=mstype.int32)
+        data_type (mindspore.dtype, optional): mindspore.dtype that lookup maps string to (default=mindspore.int32).

     Examples:
         >>> # Load vocabulary from list
@@ -308,18 +307,19 @@ class Ngram(TextTensorOperation):
     Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.

     Args:
-        n (list[int]): n in n-gram, n >= 1. n is a list of positive integers. For example, if n=[4,3], then the result
+        n (list[int]): n in n-gram, which is a list of positive integers. For example, if n=[4, 3], then the result
             would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
-            for a n-gram, an empty string will be returned. For example, 3 grams on ["mindspore","best"] will result in
+            for an n-gram, an empty string will be returned. For example, 3-grams on ["mindspore", "best"] will result in
             an empty string produced.
-        left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width
-            will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (default=None).
-        right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence.
-            pad_width will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--"
-            (default=None).
-        separator (str, optional): symbol used to join strings together. For example. if 2-gram is
+        left_pad (tuple, optional): Padding performed on left side of the sequence shaped like ("pad_token", pad_width).
+            `pad_width` will be capped at n-1. For example, specifying left_pad=("_", 2) would pad left side of the
+            sequence with "__" (default=None).
+        right_pad (tuple, optional): Padding performed on right side of the sequence shaped like
+            ("pad_token", pad_width). `pad_width` will be capped at n-1. For example, specifying right_pad=("-", 2)
+            would pad right side of the sequence with "--" (default=None).
+        separator (str, optional): Symbol used to join strings together. For example, if 2-gram is
             ["mindspore", "amazing"] with separator="-", the result would be ["mindspore-amazing"]
-            (default=None, which means whitespace is used).
+            (default=None, which will use whitespace as the separator).

     Examples:
         >>> text_file_dataset = text_file_dataset.map(operations=text.Ngram(3, separator=""))
@@ -388,6 +388,7 @@ class SlidingWindow(TextTensorOperation):
         >>> # | [3,4,5]] |
         >>> # +--------------+
     """
+
     @check_slidingwindow
     def __init__(self, width, axis=0):
         self.width = width
@@ -556,6 +557,7 @@ class PythonTokenizer:
             tokens = self.tokenizer(in_array)
         return tokens

+
 if platform.system().lower() != 'windows':
     DE_C_INTER_NORMALIZE_FORM = {
         NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
@@ -574,12 +576,12 @@ if platform.system().lower() != 'windows':
         BasicTokenizer is not supported on Windows platform yet.

         Args:
-            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
+            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode and RegexReplace
                 on input text to fold the text to lower case and strip accents characters. If False, only apply
-                NormalizeUTF8('normalization_form' mode) operation on input text (default=False).
-            keep_whitespace (bool, optional): If True, the whitespace will be kept in out tokens (default=False).
+                NormalizeUTF8 operation with the specified mode on input text (default=False).
+            keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
             normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode. This is
-                only effective when 'lower_case' is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
+                only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
             preserve_unused_token (bool, optional): If True, do not split special tokens like
                 '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
             with_offsets (bool, optional): If or not output offsets of tokens (default=False).
@@ -636,14 +638,14 @@
             vocab (Vocab): A vocabulary object.
             suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
             max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
-            unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
-                return the token directly, else return 'unknown_token'(default='[UNK]').
-            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
+            unknown_token (str, optional): When an unknown token is found, return the token directly if `unknown_token`
+                is an empty string, else return `unknown_token` instead (default='[UNK]').
+            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode and RegexReplace
                 on input text to fold the text to lower case and strip accented characters. If False, only apply
-                NormalizeUTF8('normalization_form' mode) operation on input text (default=False).
+                NormalizeUTF8 operation with the specified mode on input text (default=False).
             keep_whitespace (bool, optional): If True, the whitespace will be kept in out tokens (default=False).
             normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode,
-                only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
+                only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
             preserve_unused_token (bool, optional): If True, do not split special tokens like
                 '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
             with_offsets (bool, optional): If or not output offsets of tokens (default=False).
@@ -702,7 +704,8 @@ if platform.system().lower() != 'windows':

     class CaseFold(TextTensorOperation):
         """
-        Apply case fold operation on UTF-8 string tensor.
+        Apply case fold operation on UTF-8 string tensor, which is more aggressive than lowercasing and can convert
+        more characters into lower case.

         Note:
             CaseFold is not supported on Windows platform yet.
diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py
index 78ede245d6..437dd09b0c 100644
--- a/mindspore/dataset/transforms/c_transforms.py
+++ b/mindspore/dataset/transforms/c_transforms.py
@@ -59,23 +59,24 @@ class OneHot(cde.OneHotOp):

 class Fill(cde.FillOp):
     """
-    Tensor operation to create a tensor filled with input scalar value.
+    Tensor operation to fill all elements in the tensor with the specified value.
     The output tensor will have the same shape and type as the input tensor.

     Args:
         fill_value (Union[str, bytes, int, float, bool])) : scalar value
-            to fill created tensor with.
+            to fill the tensor with.

     Examples:
         >>> import numpy as np
-        >>> from mindspore.dataset import GeneratorDataset
-        >>> # Generate 1d int numpy array from 0 - 63
+        >>> # Generate a 1D integer numpy array from 0 to 4
         >>> def generator_1d():
-        >>>     for i in range(64):
+        ...     for i in range(5):
         ...         yield (np.array([i]),)
-        >>> generator_dataset = GeneratorDataset(generator_1d,column_names='col')
+        >>> generator_dataset = ds.GeneratorDataset(generator_1d, column_names="col1")
+        >>> # [[0], [1], [2], [3], [4]]
         >>> fill_op = c_transforms.Fill(3)
         >>> generator_dataset = generator_dataset.map(operations=fill_op)
+        >>> # [[3], [3], [3], [3], [3]]
     """

     @check_fill_value
@@ -351,6 +352,8 @@ class Unique(cde.UniqueOp):
         >>> # +---------+-----------------+---------+
     """
+
+
 class Compose():
     """
     Compose a list of transforms into a single transform.
@@ -376,6 +379,7 @@ class Compose():
             operations.append(op)
         return cde.ComposeOperation(operations)

+
 class RandomApply():
     """
     Randomly perform a series of transforms with a given probability.
diff --git a/mindspore/dataset/vision/c_transforms.py b/mindspore/dataset/vision/c_transforms.py
index 93e109beff..88fac4d0d1 100644
--- a/mindspore/dataset/vision/c_transforms.py
+++ b/mindspore/dataset/vision/c_transforms.py
@@ -62,6 +62,7 @@ class ImageTensorOperation(TensorOperation):
     """
     Base class of Image Tensor Ops
     """
+
     def __call__(self, input_tensor):
         if not isinstance(input_tensor, list):
             input_list = [input_tensor]
@@ -93,11 +94,9 @@ DE_C_BORDER_TYPE = {Border.CONSTANT: cde.BorderType.DE_BORDER_CONSTANT,
                     Border.REFLECT: cde.BorderType.DE_BORDER_REFLECT,
                     Border.SYMMETRIC: cde.BorderType.DE_BORDER_SYMMETRIC}

-
 DE_C_IMAGE_BATCH_FORMAT = {ImageBatchFormat.NHWC: cde.ImageBatchFormat.DE_IMAGE_BATCH_FORMAT_NHWC,
                            ImageBatchFormat.NCHW: cde.ImageBatchFormat.DE_IMAGE_BATCH_FORMAT_NCHW}

-
 DE_C_INTER_MODE = {Inter.NEAREST: cde.InterpolationMode.DE_INTER_NEAREST_NEIGHBOUR,
                    Inter.LINEAR: cde.InterpolationMode.DE_INTER_LINEAR,
                    Inter.CUBIC: cde.InterpolationMode.DE_INTER_CUBIC,
@@ -307,6 +306,7 @@ class Equalize(ImageTensorOperation):
         >>> image_folder_dataset = image_folder_dataset.map(operations=transforms_list,
         ...                                                 input_columns=["image"])
     """
+
     def parse(self):
         return cde.EqualizeOperation()
@@ -337,6 +337,7 @@ class Invert(ImageTensorOperation):
         >>> image_folder_dataset = image_folder_dataset.map(operations=transforms_list,
         ...                                                 input_columns=["image"])
     """
+
     def parse(self):
         return cde.InvertOperation()
@@ -729,7 +730,7 @@ class RandomCrop(ImageTensorOperation):

 class RandomCropDecodeResize(ImageTensorOperation):
     """
-    Equivalent to RandomResizedCrop, but crops before decodes.
+    A combination of `Crop`, `Decode` and `Resize`. It provides better performance for JPEG images.

     Args:
         size (Union[int, sequence]): The size of the output image.
@@ -813,7 +814,7 @@ class RandomCropWithBBox(ImageTensorOperation):

     Examples:
         >>> decode_op = c_vision.Decode()
-        >>> random_crop_with_bbox_op = c_vision.RandomCrop([512, 512], [200, 200, 200, 200])
+        >>> random_crop_with_bbox_op = c_vision.RandomCropWithBBox([512, 512], [200, 200, 200, 200])
         >>> transforms_list = [decode_op, random_crop_with_bbox_op]
         >>> image_folder_dataset = image_folder_dataset.map(operations=transforms_list,
         ...                                                 input_columns=["image"])
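
As a quick sanity check of the revised Fill docstring example, the snippet below runs it end to end. This is a minimal sketch, not part of the patch: it assumes the `import mindspore.dataset as ds` convention these docstrings rely on, and it uses `create_dict_iterator(output_numpy=True)`, which belongs to the dataset API but is not touched by this diff.

import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c_transforms

# The 1D generator from the revised Fill docstring: yields [0] .. [4].
def generator_1d():
    for i in range(5):
        yield (np.array([i]),)

generator_dataset = ds.GeneratorDataset(generator_1d, column_names="col1")
# Fill keeps the shape and type of each row but overwrites every element,
# so each [i] becomes [3].
generator_dataset = generator_dataset.map(operations=c_transforms.Fill(3))
for row in generator_dataset.create_dict_iterator(output_numpy=True):
    print(row["col1"])  # prints [3] five times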