From cd3206c26ccc0a1ef9f11aa3af9e5e302ef20ef2 Mon Sep 17 00:00:00 2001
From: Xiao Tianci
Date: Tue, 9 Mar 2021 17:42:55 +0800
Subject: [PATCH] fix erroneous examples in dataset docs

---
 mindspore/dataset/engine/datasets.py     | 106 +++++++++++++++--------
 mindspore/dataset/engine/graphdata.py    |  44 +++++-----
 mindspore/dataset/text/transforms.py     |   4 +-
 mindspore/dataset/vision/c_transforms.py |   5 +-
 4 files changed, 94 insertions(+), 65 deletions(-)

diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index 749f38fa57..4ed1d95cc6 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -507,7 +507,7 @@ class Dataset:
         Examples:
             >>> # use NumpySlicesDataset as an example
-            >>> dataset = ds.NumpySliceDataset([[0, 1], [2, 3]])
+            >>> dataset = ds.NumpySlicesDataset([[0, 1], [2, 3]])
             >>>
             >>> def flat_map_func(array):
             ...     # create a NumpySlicesDataset with the array
@@ -638,7 +638,7 @@ class Dataset:
             >>>
             >>> # Rename the column outputted by random_jitter_op to "image_mapped".
             >>> # Specifying column order works in the same way as examples in 1).
-            >>> dataset = dataset.map(operation=[decode_op, random_jitter_op], input_columns=["image"],
+            >>> dataset = dataset.map(operations=[decode_op, random_jitter_op], input_columns=["image"],
             ...                       output_columns=["image_mapped"])
             >>>
             >>> # Map with multiple operations using pyfunc. Renaming columns and specifying column order
@@ -2878,16 +2878,18 @@ class ImageFolderDataset(MappableDataset):
         ValueError: If shard_id is invalid (< 0 or >= num_shards).

     Examples:
+        >>> image_folder_dataset_dir = "/path/to/image_folder_dataset_directory"
+        >>>
         >>> # 1) Read all samples (image files) in image_folder_dataset_dir with 8 threads
-        >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
+        >>> dataset = ds.ImageFolderDataset(dataset_dir=image_folder_dataset_dir,
         ...                                 num_parallel_workers=8)
         >>>
         >>> # 2) Read all samples (image files) from folder cat and folder dog with label 0 and 1
-        >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
+        >>> dataset = ds.ImageFolderDataset(dataset_dir=image_folder_dataset_dir,
         ...                                 class_indexing={"cat":0, "dog":1})
         >>>
         >>> # 3) Read all samples (image files) in image_folder_dataset_dir with extensions .JPEG and .png (case sensitive)
-        >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
+        >>> dataset = ds.ImageFolderDataset(dataset_dir=image_folder_dataset_dir,
         ...                                 extensions=[".JPEG", ".png"])
     """
@@ -2985,8 +2987,11 @@ class MnistDataset(MappableDataset):
         ValueError: If shard_id is invalid (< 0 or >= num_shards).

     Examples:
+        >>> mnist_dataset_dir = "/path/to/mnist_dataset_directory"
+        >>>
         >>> # Read 3 samples from MNIST dataset
         >>> dataset = ds.MnistDataset(dataset_dir=mnist_dataset_dir, num_samples=3)
+        >>>
         >>> # Note: In mnist_dataset dataset, each dictionary has keys "image" and "label"
     """
@@ -3034,6 +3039,10 @@ class MindDataset(MappableDataset):
     Raises:
         ValueError: If num_shards is specified but shard_id is None.
         ValueError: If shard_id is specified but num_shards is None.
+
+    Examples:
+        >>> mind_dataset_dir = ["/path/to/mind_dataset_file"] # contains 1 or multiple MindRecord files
+        >>> dataset = ds.MindDataset(dataset_file=mind_dataset_dir)
     """

     def parse(self, children=None):
@@ -3425,14 +3434,14 @@ class GeneratorDataset(MappableDataset):
         ...     for i in range(64):
         ...         yield (np.array([[i, i + 1], [i + 2, i + 3]]),)
         >>>
-        >>> dataset = ds.GeneratorDataset(generator_multidimensional, ["multi_dimensional_data"])
+        >>> dataset = ds.GeneratorDataset(source=generator_multidimensional, column_names=["multi_dimensional_data"])
         >>>
         >>> # 2) Multi-column generator function as callable input.
         >>> def generator_multi_column():
         ...     for i in range(64):
-        ...         yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]])
+        ...         yield np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]])
         >>>
-        >>> dataset = ds.GeneratorDataset(generator_multi_column, ["col1", "col2"])
+        >>> dataset = ds.GeneratorDataset(source=generator_multi_column, column_names=["col1", "col2"])
         >>>
         >>> # 3) Iterable dataset as iterable input.
         >>> class MyIterable:
@@ -3450,12 +3459,13 @@ class GeneratorDataset(MappableDataset):
         ...         return item
         ...
         ...     def __iter__(self):
+        ...         self._index = 0
         ...         return self
         ...
         ...     def __len__(self):
         ...         return len(self._data)
         >>>
-        >>> dataset = ds.GeneratorDataset(MyIterable(), ["data", "label"])
+        >>> dataset = ds.GeneratorDataset(source=MyIterable(), column_names=["data", "label"])
         >>>
         >>> # 4) Random accessible dataset as random accessible input.
         >>> class MyAccessible:
         ...     def __init__(self):
@@ -3469,10 +3479,10 @@ class GeneratorDataset(MappableDataset):
         ...     def __len__(self):
         ...         return len(self._data)
         >>>
-        >>> dataset = ds.GeneratorDataset(MyAccessible(), ["data", "label"])
+        >>> dataset = ds.GeneratorDataset(source=MyAccessible(), column_names=["data", "label"])
         >>>
         >>> # list, dict, tuple of Python is also random accessible
-        >>> dataset = ds.GeneratorDataset([(np.array(0),), (np.array(1),), (np.array(2),)], ["col"])
+        >>> dataset = ds.GeneratorDataset(source=[(np.array(0),), (np.array(1),), (np.array(2),)], column_names=["col"])
     """

     @check_generatordataset
@@ -3590,17 +3600,17 @@ class TFRecordDataset(SourceDataset):
     Examples:
         >>> import mindspore.common.dtype as mstype
         >>>
-        >>> tfrecord_dataset_dir = ["/path/to/tfrecord_dataset_file"] # contains 1 or multiple tf data files
+        >>> tfrecord_dataset_dir = ["/path/to/tfrecord_dataset_file"] # contains 1 or multiple TFRecord files
         >>> tfrecord_schema_file = "/path/to/tfrecord_schema_file"
         >>>
         >>> # 1) Get all rows from tfrecord_dataset_dir with no explicit schema.
         >>> # The meta-data in the first row will be used as a schema.
-        >>> dataset = ds.TFRecordDataset(tfrecord_dataset_dir)
+        >>> dataset = ds.TFRecordDataset(dataset_files=tfrecord_dataset_dir)
         >>>
         >>> # 2) Get all rows from tfrecord_dataset_dir with user-defined schema.
         >>> schema = ds.Schema()
-        >>> schema.add_column('col_1d', de_type=mstype.int64, shape=[2])
-        >>> dataset = ds.TFRecordDataset(tfrecord_dataset_dir, schema=schema)
+        >>> schema.add_column(name='col_1d', de_type=mstype.int64, shape=[2])
+        >>> dataset = ds.TFRecordDataset(dataset_files=tfrecord_dataset_dir, schema=schema)
         >>>
         >>> # 3) Get all rows from tfrecord_dataset_dir with schema file.
         >>> dataset = ds.TFRecordDataset(dataset_files=tfrecord_dataset_dir, schema=tfrecord_schema_file)
@@ -3697,13 +3707,13 @@ class ManifestDataset(MappableDataset):
         ValueError: If shard_id is invalid (< 0 or >= num_shards).

     Examples:
+        >>> manifest_dataset_dir = "/path/to/manifest_dataset_file"
+        >>>
         >>> # 1) Read all samples specified in manifest_dataset_dir dataset with 8 threads for training
-        >>> dataset = ds.ManifestDataset(manifest_dataset_dir, usage="train", num_parallel_workers=8)
+        >>> dataset = ds.ManifestDataset(dataset_file=manifest_dataset_dir, usage="train", num_parallel_workers=8)
         >>>
-        >>> # 2) Read samples (specified in manifest_file.manifest) for shard 0
-        >>> # in a 2-way distributed training setup
-        >>> dataset = ds.ManifestDataset(manifest_dataset_dir, num_shards=2, shard_id=0)
-
+        >>> # 2) Read samples (specified in manifest_file.manifest) for shard 0 in a 2-way distributed training setup
+        >>> dataset = ds.ManifestDataset(dataset_file=manifest_dataset_dir, num_shards=2, shard_id=0)
     """

     @check_manifestdataset
@@ -3815,6 +3825,8 @@ class Cifar10Dataset(MappableDataset):
         ValueError: If shard_id is invalid (< 0 or >= num_shards).

     Examples:
+        >>> cifar10_dataset_dir = "/path/to/cifar10_dataset_directory"
+        >>>
         >>> # 1) Get all samples from CIFAR10 dataset in sequence
         >>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, shuffle=False)
         >>>
@@ -3920,6 +3932,8 @@ class Cifar100Dataset(MappableDataset):
         ValueError: If shard_id is invalid (< 0 or >= num_shards).

     Examples:
+        >>> cifar100_dataset_dir = "/path/to/cifar100_dataset_directory"
+        >>>
         >>> # 1) Get all samples from CIFAR100 dataset in sequence
         >>> dataset = ds.Cifar100Dataset(dataset_dir=cifar100_dataset_dir, shuffle=False)
         >>>
@@ -3999,7 +4013,7 @@ class Schema:
         >>>
         >>> # Create schema; specify column name, mindspore.dtype and shape of the column
         >>> schema = ds.Schema()
-        >>> schema.add_column('col1', de_type=mstype.int64, shape=[2])
+        >>> schema.add_column(name='col1', de_type=mstype.int64, shape=[2])
     """

     @check_schema
@@ -4190,17 +4204,21 @@ class VOCDataset(MappableDataset):
         ValueError: If shard_id is invalid (< 0 or >= num_shards).

     Examples:
+        >>> voc_dataset_dir = "/path/to/voc_dataset_directory"
+        >>>
         >>> # 1) Read VOC data for segmentation training
-        >>> dataset = ds.VOCDataset(voc_dataset_dir, task="Segmentation", usage="train")
+        >>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir, task="Segmentation", usage="train")
         >>>
         >>> # 2) Read VOC data for detection training
-        >>> dataset = ds.VOCDataset(voc_dataset_dir, task="Detection", usage="train")
+        >>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir, task="Detection", usage="train")
         >>>
         >>> # 3) Read all VOC dataset samples in voc_dataset_dir with 8 threads in random order
-        >>> dataset = ds.VOCDataset(voc_dataset_dir, task="Detection", usage="train", num_parallel_workers=8)
+        >>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir, task="Detection", usage="train",
+        ...                         num_parallel_workers=8)
         >>>
         >>> # 4) Read then decode all VOC dataset samples in voc_dataset_dir in sequence
-        >>> dataset = ds.VOCDataset(voc_dataset_dir, task="Detection", usage="train", decode=True, shuffle=False)
+        >>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir, task="Detection", usage="train",
+        ...                         decode=True, shuffle=False)
         >>>
         >>> # In VOC dataset, if task='Segmentation', each dictionary has keys "image" and "target"
         >>> # In VOC dataset, if task='Detection', each dictionary has keys "image" and "annotation"
@@ -4344,17 +4362,28 @@ class CocoDataset(MappableDataset):
         ValueError: If shard_id is invalid (< 0 or >= num_shards).

     Examples:
+        >>> coco_dataset_dir = "/path/to/coco_dataset_directory/images"
+        >>> coco_annotation_file = "/path/to/coco_dataset_directory/annotation_file"
+        >>>
         >>> # 1) Read COCO data for Detection task
-        >>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Detection')
+        >>> dataset = ds.CocoDataset(dataset_dir=coco_dataset_dir,
+        ...                          annotation_file=coco_annotation_file,
+        ...                          task='Detection')
         >>>
         >>> # 2) Read COCO data for Stuff task
-        >>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Stuff')
+        >>> dataset = ds.CocoDataset(dataset_dir=coco_dataset_dir,
+        ...                          annotation_file=coco_annotation_file,
+        ...                          task='Stuff')
         >>>
         >>> # 3) Read COCO data for Panoptic task
-        >>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Panoptic')
+        >>> dataset = ds.CocoDataset(dataset_dir=coco_dataset_dir,
+        ...                          annotation_file=coco_annotation_file,
+        ...                          task='Panoptic')
         >>>
         >>> # 4) Read COCO data for Keypoint task
-        >>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Keypoint')
+        >>> dataset = ds.CocoDataset(dataset_dir=coco_dataset_dir,
+        ...                          annotation_file=coco_annotation_file,
+        ...                          task='Keypoint')
         >>>
         >>> # In COCO dataset, each dictionary has keys "image" and "annotation"
     """
@@ -4445,6 +4474,7 @@ class CelebADataset(MappableDataset):
         (default=None, which means no cache is used).

     Examples:
+        >>> celeba_dataset_dir = "/path/to/celeba_dataset_directory"
         >>> dataset = ds.CelebADataset(dataset_dir=celeba_dataset_dir, usage='train')
     """
@@ -4519,7 +4549,7 @@ class CLUEDataset(SourceDataset):
         (default=None, which means no cache is used).

     Examples:
-        >>> clue_dataset_dir = ["/path/to/clue_dataset_file"] # contains 1 or multiple text files
+        >>> clue_dataset_dir = ["/path/to/clue_dataset_file"] # contains 1 or multiple CLUE files
         >>> dataset = ds.CLUEDataset(dataset_files=clue_dataset_dir, task='AFQMC', usage='train')
     """
@@ -4647,7 +4677,7 @@ class CSVDataset(SourceDataset):

     Examples:
-        >>> csv_dataset_dir = ["/path/to/csv_dataset_file"]
+        >>> csv_dataset_dir = ["/path/to/csv_dataset_file"] # contains 1 or multiple CSV files
         >>> dataset = ds.CSVDataset(dataset_files=csv_dataset_dir, column_names=['col1', 'col2', 'col3', 'col4'])
     """
@@ -4696,7 +4726,7 @@ class TextFileDataset(SourceDataset):
         (default=None, which means no cache is used).

     Examples:
-        >>> # contains 1 or multiple text files
+        >>> text_file_dataset_dir = ["/path/to/text_file_dataset_file"] # contains 1 or multiple text files
         >>> dataset = ds.TextFileDataset(dataset_files=text_file_dataset_dir)
     """
@@ -4833,20 +4863,20 @@ class NumpySlicesDataset(GeneratorDataset):
     Examples:
         >>> # 1) Input data can be a list
         >>> data = [1, 2, 3]
-        >>> dataset = ds.NumpySlicesDataset(data, column_names=["column_1"])
+        >>> dataset = ds.NumpySlicesDataset(data=data, column_names=["column_1"])
         >>>
         >>> # 2) Input data can be a dictionary, and column_names will be its keys
         >>> data = {"a": [1, 2], "b": [3, 4]}
-        >>> dataset = ds.NumpySlicesDataset(data)
+        >>> dataset = ds.NumpySlicesDataset(data=data)
         >>>
         >>> # 3) Input data can be a tuple of lists (or NumPy arrays), each tuple element refers to data in each column
         >>> data = ([1, 2], [3, 4], [5, 6])
-        >>> dataset = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"])
+        >>> dataset = ds.NumpySlicesDataset(data=data, column_names=["column_1", "column_2", "column_3"])
         >>>
         >>> # 4) Load data from CSV file
         >>> import pandas as pd
-        >>> df = pd.read_csv(csv_dataset_dir[0])
-        >>> dataset = ds.NumpySlicesDataset(dict(df), shuffle=False)
+        >>> df = pd.read_csv(filepath_or_buffer=csv_dataset_dir[0])
+        >>> dataset = ds.NumpySlicesDataset(data=dict(df), shuffle=False)
     """

     @check_numpyslicesdataset
@@ -4893,7 +4923,7 @@ class PaddedDataset(GeneratorDataset):
     Examples:
         >>> import numpy as np
         >>> data = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)}]
-        >>> dataset = ds.PaddedDataset(data)
+        >>> dataset = ds.PaddedDataset(padded_samples=data)
     """

     @check_paddeddataset
diff --git a/mindspore/dataset/engine/graphdata.py b/mindspore/dataset/engine/graphdata.py
index a8a29ccdbe..f0ff2dba4a 100644
--- a/mindspore/dataset/engine/graphdata.py
+++ b/mindspore/dataset/engine/graphdata.py
@@ -72,9 +72,10 @@ class GraphData:
         the server automatically exits (default=True).

     Examples:
-        >>> graph_dataset = ds.GraphData(graph_dataset_dir, 2)
-        >>> nodes = graph_dataset.get_all_nodes(1)
-        >>> features = graph_dataset.get_node_feature(nodes, [1])
+        >>> graph_dataset_dir = "/path/to/graph_dataset_file"
+        >>> graph_dataset = ds.GraphData(dataset_file=graph_dataset_dir, num_parallel_workers=2)
+        >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+        >>> features = graph_dataset.get_node_feature(node_list=nodes, feature_types=[1])
     """

     @check_gnn_graphdata
@@ -114,7 +115,7 @@ class GraphData:
             numpy.ndarray, array of nodes.

         Examples:
-            >>> nodes = graph_dataset.get_all_nodes(1)
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)

         Raises:
             TypeError: If `node_type` is not integer.
@@ -135,7 +136,7 @@ class GraphData:
             numpy.ndarray, array of edges.

         Examples:
-            >>> edges = graph_dataset.get_all_edges(0)
+            >>> edges = graph_dataset.get_all_edges(edge_type=0)

         Raises:
             TypeError: If `edge_type` is not integer.
@@ -175,8 +176,8 @@ class GraphData:
             numpy.ndarray, array of neighbors.

         Examples:
-            >>> nodes = graph_dataset.get_all_nodes(1)
-            >>> neighbors = graph_dataset.get_all_neighbors(nodes, 2)
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> neighbors = graph_dataset.get_all_neighbors(node_list=nodes, neighbor_type=2)

         Raises:
             TypeError: If `node_list` is not list or ndarray.
@@ -211,8 +212,9 @@ class GraphData:
             numpy.ndarray, array of neighbors.

         Examples:
-            >>> nodes = graph_dataset.get_all_nodes(1)
-            >>> neighbors = graph_dataset.get_sampled_neighbors(nodes, [2, 2], [2, 1])
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> neighbors = graph_dataset.get_sampled_neighbors(node_list=nodes, neighbor_nums=[2, 2],
+            ...                                                 neighbor_types=[2, 1])

         Raises:
             TypeError: If `node_list` is not list or ndarray.
@@ -240,8 +242,9 @@ class GraphData:
             numpy.ndarray, array of neighbors.

         Examples:
-            >>> nodes = graph_dataset.get_all_nodes(1)
-            >>> neg_neighbors = graph_dataset.get_neg_sampled_neighbors(nodes, 5, 2)
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> neg_neighbors = graph_dataset.get_neg_sampled_neighbors(node_list=nodes, neg_neighbor_num=5,
+            ...                                                         neg_neighbor_type=2)

         Raises:
             TypeError: If `node_list` is not list or ndarray.
@@ -266,8 +269,8 @@ class GraphData:
             numpy.ndarray, array of features.

         Examples:
-            >>> nodes = graph_dataset.get_all_nodes(1)
-            >>> features = graph_dataset.get_node_feature(nodes, [2, 3])
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> features = graph_dataset.get_node_feature(node_list=nodes, feature_types=[2, 3])

         Raises:
             TypeError: If `node_list` is not list or ndarray.
@@ -295,8 +298,8 @@ class GraphData:
             numpy.ndarray, array of features.

         Examples:
-            >>> edges = graph_dataset.get_all_edges(0)
-            >>> features = graph_dataset.get_edge_feature(edges, [1])
+            >>> edges = graph_dataset.get_all_edges(edge_type=0)
+            >>> features = graph_dataset.get_edge_feature(edge_list=edges, feature_types=[1])

         Raises:
             TypeError: If `edge_list` is not list or ndarray.
@@ -325,13 +328,7 @@ class GraphData:
         return self._graph_data.graph_info()

     @check_gnn_random_walk
-    def random_walk(
-            self,
-            target_nodes,
-            meta_path,
-            step_home_param=1.0,
-            step_away_param=1.0,
-            default_node=-1):
+    def random_walk(self, target_nodes, meta_path, step_home_param=1.0, step_away_param=1.0, default_node=-1):
         """
         Random walk in nodes.
@@ -347,7 +344,8 @@ class GraphData:
             numpy.ndarray, array of nodes.

         Examples:
-            >>> nodes = graph_dataset.random_walk([1, 2], [1, 2, 1, 2, 1])
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> walks = graph_dataset.random_walk(target_nodes=nodes, meta_path=[2, 1, 2])

         Raises:
             TypeError: If `target_nodes` is not list or ndarray.
diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index 25ba4c85c4..2bdbc82cc5 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -459,8 +459,8 @@ class UnicodeCharTokenizer(TextTensorOperation):
         >>> # ["offsets_limit", dtype=uint32]}
         >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=True)
         >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
-        >>>                                           output_columns=["token", "offsets_start", "offsets_limit"],
-        >>>                                           column_order=["token", "offsets_start", "offsets_limit"])
+        ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
+        ...                                           column_order=["token", "offsets_start", "offsets_limit"])
     """

     @check_with_offsets
diff --git a/mindspore/dataset/vision/c_transforms.py b/mindspore/dataset/vision/c_transforms.py
index 42a0664cc4..aa8786e90e 100644
--- a/mindspore/dataset/vision/c_transforms.py
+++ b/mindspore/dataset/vision/c_transforms.py
@@ -62,6 +62,7 @@ class ImageTensorOperation(TensorOperation):
     """
     Base class of Image Tensor Ops
     """
+
     def __call__(self, *input_tensor_list):
         for tensor in input_tensor_list:
             if not isinstance(tensor, (np.ndarray, Image.Image)):
@@ -1142,8 +1143,8 @@ class RandomSelectSubpolicy(ImageTensorOperation):
         ...                      (c_vision.RandomColorAdjust(), 0.8)],
         ...                     [(c_vision.RandomRotation((90, 90)), 1),
         ...                      (c_vision.RandomColorAdjust(), 0.2)]]
-        >>> image_folder_dataset_1 = image_folder_dataset.map(operations=c_vision.RandomSelectSubpolicy(policy),
-        ...                                                   input_columns=["image"])
+        >>> image_folder_dataset = image_folder_dataset.map(operations=c_vision.RandomSelectSubpolicy(policy),
+        ...                                                 input_columns=["image"])
     """

     @check_random_select_subpolicy_op
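
--
Note (not part of the patch): below is a minimal self-check of the corrected keyword-argument style used throughout these examples. It is an illustrative sketch only -- it assumes a working MindSpore install imported as `mindspore.dataset`, and it uses toy in-memory data instead of real dataset paths; the script and variable names are made up for the demo.

# sanity_check.py -- exercises two of the APIs touched above
import numpy as np
import mindspore.dataset as ds

# NumpySlicesDataset with the corrected class name and explicit data=/column_names=.
slices_dataset = ds.NumpySlicesDataset(data=[1, 2, 3], column_names=["column_1"], shuffle=False)

# GeneratorDataset with source= and column_names= spelled out, mirroring example 2) above.
def generator_multi_column():
    for i in range(4):
        yield np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]])

gen_dataset = ds.GeneratorDataset(source=generator_multi_column, column_names=["col1", "col2"])

# Iterate once to confirm both pipelines build and rows have the expected shapes.
for col1, col2 in gen_dataset.create_tuple_iterator(output_numpy=True):
    print(col1.shape, col2.shape)  # expect (1,) and (2, 2) per row

Spelling out the keyword arguments keeps these docstring examples copy-paste safe even if positional parameters are added or reordered upstream.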