dataset API docstring: Update datasets, samplers, graphdata and text

pull/6227/head
Cathy Wong 5 years ago
parent d0e49c5cf8
commit f7adf648e9

File diff suppressed because it is too large

@ -34,29 +34,36 @@ class GraphData:
Reads the graph dataset used for GNN training from the shared file and database.
Args:
dataset_file (str): One of file names in dataset.
num_parallel_workers (int, optional): Number of workers to process the Dataset in parallel
dataset_file (str): One of file names in the dataset.
num_parallel_workers (int, optional): Number of workers to process the dataset in parallel
(default=None).
working_mode (str, optional): Set working mode, now support 'local'/'client'/'server' (default='local').
working_mode (str, optional): Set working mode, now supports 'local'/'client'/'server' (default='local').
- 'local', used in non-distributed training scenarios.
- 'client', used in distributed training scenarios, the client does not load data,
- 'client', used in distributed training scenarios. The client does not load data,
but obtains data from the server.
- 'server', used in distributed training scenarios, the server loads the data
- 'server', used in distributed training scenarios. The server loads the data
and is available to the client.
hostname (str, optional): Valid when working_mode is set to 'client' or 'server',
set the hostname of the graph data server (default='127.0.0.1').
port (int, optional): Valid when working_mode is set to 'client' or 'server',
set the port of the graph data server, the range is 1024-65535 (default=50051).
num_client (int, optional): Valid when working_mode is set to 'server',
set the number of clients expected to connect, and the server will allocate corresponding
resources according to this parameter (default=1).
hostname (str, optional): Hostname of the graph data server. This parameter is only valid when
working_mode is set to 'client' or 'server' (default='127.0.0.1').
port (int, optional): Port of the graph data server. The range is 1024-65535. This parameter is
only valid when working_mode is set to 'client' or 'server' (default=50051).
num_client (int, optional): Maximum number of clients expected to connect to the server. The server will
allocate resources according to this parameter. This parameter is only valid when working_mode
is set to 'server' (default=1).
auto_shutdown (bool, optional): Valid when working_mode is set to 'server'. When the number of
connected clients reaches num_client and no client is connected,
the server automatically exits (default=True).
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> features = data_graph.get_node_feature(nodes, [1])
"""
@check_gnn_graphdata
@ -94,10 +101,11 @@ class GraphData:
node_type (int): Specify the type of node.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
@ -121,6 +129,7 @@ class GraphData:
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_edges(0)
@ -140,7 +149,7 @@ class GraphData:
edge_list (Union[list, numpy.ndarray]): The given list of edges.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Raises:
TypeError: If `edge_list` is not list or ndarray.
@ -159,10 +168,11 @@ class GraphData:
neighbor_type (int): Specify the type of neighbor.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> neighbors = data_graph.get_all_neighbors(nodes, 0)
@ -192,13 +202,14 @@ class GraphData:
neighbor_types (Union[list, numpy.ndarray]): Neighbor type sampled per hop.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> neighbors = data_graph.get_all_neighbors(nodes, [2, 2], [0, 0])
>>> neighbors = data_graph.get_sampled_neighbors(nodes, [2, 2], [0, 0])
Raises:
TypeError: If `node_list` is not list or ndarray.
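The per-hop fan-out above (one entry of neighbor_nums per hop) can be illustrated on a toy adjacency dict. This is a hedged sketch: node/neighbor types and the real graph store are omitted for brevity:

```python
import random

# Toy sketch of per-hop neighbor sampling; the real GraphData samples from
# a graph store, and neighbor types are ignored here.
def sampled_neighbors(adj, node_list, neighbor_nums, seed=0):
    rng = random.Random(seed)
    result = []
    for node in node_list:
        frontier = [node]
        trace = [node]
        for k in neighbor_nums:  # one fan-out value per hop
            next_frontier = []
            for cur in frontier:
                # sample k neighbors with replacement
                picks = [rng.choice(adj[cur]) for _ in range(k)]
                next_frontier.extend(picks)
            trace.extend(next_frontier)
            frontier = next_frontier
        result.append(trace)
    return result
```

With fan-outs [2, 2], each trace contains the start node, 2 one-hop samples, and 4 two-hop samples.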
@ -221,10 +232,11 @@ class GraphData:
neg_neighbor_type (int): Specify the type of negative neighbor.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> neg_neighbors = data_graph.get_neg_sampled_neighbors(nodes, 5, 0)
@ -253,6 +265,7 @@ class GraphData:
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> features = data_graph.get_node_feature(nodes, [1])
@ -284,6 +297,7 @@ class GraphData:
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> edges = data_graph.get_all_edges(0)
>>> features = data_graph.get_edge_feature(edges, [1])
@ -334,10 +348,11 @@ class GraphData:
A default value of -1 indicates that no node is given.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.random_walk([1,2], [1,2,1,2,1])
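The walk itself can be illustrated without MindSpore. Note this is a simplified sketch: the real API walks along a meta path of node types and supports bias parameters, both of which are omitted here:

```python
import random

# Minimal random-walk sketch on an adjacency dict; the real random_walk
# follows a meta path of node types, which this sketch ignores.
def random_walk(adj, start_nodes, walk_len, seed=0):
    rng = random.Random(seed)
    walks = []
    for start in start_nodes:
        walk = [start]
        for _ in range(walk_len):
            cur = walk[-1]
            walk.append(rng.choice(adj[cur]))
        walks.append(walk)
    return walks
```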

@ -13,10 +13,10 @@
# limitations under the License.
# ==============================================================================
"""
Sampler module provides several samplers to generate sampling data from dataset.
There are following samplers: DistributedSampler, PKSampler, RandomSampler,
SequentialSampler, SubsetRandomSampler, WeightedRandomSampler.
User can also define custom sampler by extending from Sampler class.
The sampler module provides several samplers to generate data from datasets.
The provided samplers include: DistributedSampler, PKSampler, RandomSampler,
SequentialSampler, SubsetRandomSampler, and WeightedRandomSampler.
Users can also define a custom sampler by extending from the Sampler class.
"""
import numpy as np
@ -26,9 +26,9 @@ import mindspore.dataset as ds
class Sampler:
"""
Base class for user defined sampler.
User defined sampler can be used with any existing dataset with sampler support.
A user defined sampler can be used with any existing dataset with sampler support.
An required _iter_() method should by overridden by user for sample index generation.
A required __iter__() method should be overridden by the user for sample index generation.
An optional reset() method can be overridden for per-repeat reset.
dataset_size and num_samples will be set by the dataset once a dataset iterator is created.
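The contract described here (override __iter__ to yield indices, optional reset(), dataset_size set by the dataset before iteration) can be sketched without MindSpore:

```python
# Framework-free sketch of the Sampler contract: a subclass overrides
# __iter__ to yield sample indices; dataset_size is assumed to be set
# by the dataset before iteration (here we set it by hand).
class Sampler:
    def __init__(self):
        self.dataset_size = 0
        self.num_samples = 0

    def __iter__(self):
        raise NotImplementedError

    def reset(self):
        # optional per-repeat reset
        pass

class ReverseSampler(Sampler):
    def __iter__(self):
        # yield indices from last to first
        return iter(range(self.dataset_size - 1, -1, -1))

s = ReverseSampler()
s.dataset_size = 4
```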
@ -52,8 +52,7 @@ class Sampler:
def __iter__(self):
"""
User defined iterator, must be overridden.
_handshake is guaranteed to be called prior to iterator construction
_handshake is guaranteed to be called prior to iterator construction.
"""
raise NotImplementedError
@ -160,10 +159,10 @@ class BuiltinSampler:
def get_num_samples(self):
"""
All samplers can contain a numeric num_samples value (or it could be set to None).
Child sampler can exist or be None.
if child sampler exists, then the child sampler count can be a numeric value or None.
Given these conditions, we need to output what the sampler count is for this sampler.
All samplers can contain a numeric num_samples value (or it can be set to None).
A child sampler can exist or be None.
If a child sampler exists, then the child sampler count can be a numeric value or None.
These conditions impact the resultant sampler count that is used.
The following table shows the possible results from calling this function.
.. list-table::
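Assuming the usual rule that the smaller of two numeric counts wins (an assumption based on the description above, not a copy of the implementation), the resolution can be sketched as:

```python
# Hedged sketch of num_samples resolution between a sampler and its child;
# the min() rule for two numeric counts is an assumption, not the verified
# MindSpore behavior.
def resolve_num_samples(num_samples, child_count):
    if num_samples is None:
        return child_count          # may itself be None (all elements)
    if child_count is None:
        return num_samples
    return min(num_samples, child_count)
```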
@ -217,20 +216,20 @@ class BuiltinSampler:
class DistributedSampler(BuiltinSampler):
"""
Sampler that access a shard of the dataset.
A sampler that accesses a shard of the dataset.
Args:
num_shards (int): Number of shards to divide the dataset into.
shard_id (int): Shard ID of the current shard within num_shards.
shuffle (bool, optional): If true, the indices are shuffled (default=True).
shuffle (bool, optional): If True, the indices are shuffled (default=True).
num_samples (int, optional): The number of samples to draw (default=None, all elements).
offset(int, optional): Offset from shard when the element of dataset is allocated
offset (int, optional): Offset from the shard when elements of the dataset are allocated (default=-1).
Examples:
>>> import mindspore.dataset as ds
>>>
>>> dataset_dir = "path/to/imagefolder_directory"
>>>
>>> # creates a distributed sampler with 10 shards total. This shard is shard 5
>>> # creates a distributed sampler with 10 shards in total. This shard is shard 5.
>>> sampler = ds.DistributedSampler(10, 5)
>>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
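One common way to assign dataset indices to a shard is round-robin by shard_id. The sketch below illustrates only the idea; the actual sampler may assign differently, e.g. when shuffling or when offset is used:

```python
# Illustrative round-robin sharding: shard shard_id out of num_shards
# receives every num_shards-th index.
def shard_indices(dataset_size, num_shards, shard_id):
    return [i for i in range(dataset_size) if i % num_shards == shard_id]
```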
@ -304,8 +303,8 @@ class PKSampler(BuiltinSampler):
Args:
num_val (int): Number of elements to sample for each class.
num_class (int, optional): Number of classes to sample (default=None, all classes).
shuffle (bool, optional): If true, the class IDs are shuffled (default=False).
class_column (str, optional): Name of column to classify dataset(default='label'), for MindDataset.
shuffle (bool, optional): If True, the class IDs are shuffled (default=False).
class_column (str, optional): Name of column with class labels for MindDataset (default='label').
num_samples (int, optional): The number of samples to draw (default=None, all elements).
Examples:
@ -372,6 +371,7 @@ class PKSampler(BuiltinSampler):
c_sampler.add_child(c_child_sampler)
return c_sampler
class RandomSampler(BuiltinSampler):
"""
Samples the elements randomly.
@ -437,7 +437,7 @@ class SequentialSampler(BuiltinSampler):
Samples the dataset elements sequentially, same as not having a sampler.
Args:
start_index (int, optional): Index to start sampling at. (dafault=None starts at first id)
start_index (int, optional): Index to start sampling at (default=None, starts at first ID).
num_samples (int, optional): Number of elements to sample (default=None, all elements).
Examples:

@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module is to support text processing for nlp. It includes two parts:
This module supports text processing for NLP. It includes two parts:
transforms and utils. transforms is a high-performance
nlp text processing module which is developed with icu4c and cppjieba.
utils provides some general methods for nlp text processing.
NLP text processing module which is developed with ICU4C and cppjieba.
utils provides some general methods for NLP text processing.
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \

File diff suppressed because it is too large

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The module text.utils provides some general methods for nlp text processing.
The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary,
use to_bytes and to_str to encode and decode strings into a specified format.
"""

@ -131,7 +131,7 @@ def test_cv_minddataset_pk_sample_error_class_column():
create_cv_mindrecord(1)
columns_list = ["data", "file_name", "label"]
num_readers = 4
sampler = ds.PKSampler(5, None, True, 'no_exsit_column')
sampler = ds.PKSampler(5, None, True, 'no_exist_column')
with pytest.raises(Exception, match="MindRecordOp launch failed"):
data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers, sampler=sampler)
num_iter = 0
