dataset API docstring: Update datasets, samplers, graphdata and text

pull/6227/head
Cathy Wong 5 years ago
parent d0e49c5cf8
commit f7adf648e9

File diff suppressed because it is too large Load Diff

@ -34,29 +34,36 @@ class GraphData:
Reads the graph dataset used for GNN training from the shared file and database. Reads the graph dataset used for GNN training from the shared file and database.
Args: Args:
dataset_file (str): One of file names in dataset. dataset_file (str): One of file names in the dataset.
num_parallel_workers (int, optional): Number of workers to process the Dataset in parallel num_parallel_workers (int, optional): Number of workers to process the dataset in parallel
(default=None). (default=None).
working_mode (str, optional): Set working mode, now support 'local'/'client'/'server' (default='local'). working_mode (str, optional): Set working mode, now supports 'local'/'client'/'server' (default='local').
- 'local', used in non-distributed training scenarios. - 'local', used in non-distributed training scenarios.
- 'client', used in distributed training scenarios, the client does not load data, - 'client', used in distributed training scenarios. The client does not load data,
but obtains data from the server. but obtains data from the server.
- 'server', used in distributed training scenarios, the server loads the data - 'server', used in distributed training scenarios. The server loads the data
and is available to the client. and is available to the client.
hostname (str, optional): Valid when working_mode is set to 'client' or 'server', hostname (str, optional): Hostname of the graph data server. This parameter is only valid when
set the hostname of the graph data server (default='127.0.0.1'). working_mode is set to 'client' or 'server' (default='127.0.0.1').
port (int, optional): Valid when working_mode is set to 'client' or 'server', port (int, optional): Port of the graph data server. The range is 1024-65535. This parameter is
set the port of the graph data server, the range is 1024-65535 (default=50051). only valid when working_mode is set to 'client' or 'server' (default=50051).
num_client (int, optional): Valid when working_mode is set to 'server', num_client (int, optional): Maximum number of clients expected to connect to the server. The server will
set the number of clients expected to connect, and the server will allocate corresponding allocate resources according to this parameter. This parameter is only valid when working_mode
resources according to this parameter (default=1). is set to 'server' (default=1).
auto_shutdown (bool, optional): Valid when working_mode is set to 'server', auto_shutdown (bool, optional): Valid when working_mode is set to 'server',
when the number of connected clients reaches num_client and no client is being connected, when the number of connected clients reaches num_client and no client is being connected,
the server automatically exits (default=True). the server automatically exits (default=True).
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> features = data_graph.get_node_feature(nodes, [1])
""" """
@check_gnn_graphdata @check_gnn_graphdata
@ -94,10 +101,11 @@ class GraphData:
node_type (int): Specify the type of node. node_type (int): Specify the type of node.
Returns: Returns:
numpy.ndarray: array of nodes. numpy.ndarray: Array of nodes.
Examples: Examples:
>>> import mindspore.dataset as ds >>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2) >>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0) >>> nodes = data_graph.get_all_nodes(0)
@ -121,6 +129,7 @@ class GraphData:
Examples: Examples:
>>> import mindspore.dataset as ds >>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2) >>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_edges(0) >>> nodes = data_graph.get_all_edges(0)
@ -140,7 +149,7 @@ class GraphData:
edge_list (Union[list, numpy.ndarray]): The given list of edges. edge_list (Union[list, numpy.ndarray]): The given list of edges.
Returns: Returns:
numpy.ndarray: array of nodes. numpy.ndarray: Array of nodes.
Raises: Raises:
TypeError: If `edge_list` is not list or ndarray. TypeError: If `edge_list` is not list or ndarray.
@ -159,10 +168,11 @@ class GraphData:
neighbor_type (int): Specify the type of neighbor. neighbor_type (int): Specify the type of neighbor.
Returns: Returns:
numpy.ndarray: array of nodes. numpy.ndarray: Array of nodes.
Examples: Examples:
>>> import mindspore.dataset as ds >>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2) >>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0) >>> nodes = data_graph.get_all_nodes(0)
>>> neighbors = data_graph.get_all_neighbors(nodes, 0) >>> neighbors = data_graph.get_all_neighbors(nodes, 0)
@ -192,13 +202,14 @@ class GraphData:
neighbor_types (Union[list, numpy.ndarray]): Neighbor type sampled per hop. neighbor_types (Union[list, numpy.ndarray]): Neighbor type sampled per hop.
Returns: Returns:
numpy.ndarray: array of nodes. numpy.ndarray: Array of nodes.
Examples: Examples:
>>> import mindspore.dataset as ds >>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2) >>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0) >>> nodes = data_graph.get_all_nodes(0)
>>> neighbors = data_graph.get_all_neighbors(nodes, [2, 2], [0, 0]) >>> neighbors = data_graph.get_sampled_neighbors(nodes, [2, 2], [0, 0])
Raises: Raises:
TypeError: If `node_list` is not list or ndarray. TypeError: If `node_list` is not list or ndarray.
@ -221,10 +232,11 @@ class GraphData:
neg_neighbor_type (int): Specify the type of negative neighbor. neg_neighbor_type (int): Specify the type of negative neighbor.
Returns: Returns:
numpy.ndarray: array of nodes. numpy.ndarray: Array of nodes.
Examples: Examples:
>>> import mindspore.dataset as ds >>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2) >>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0) >>> nodes = data_graph.get_all_nodes(0)
>>> neg_neighbors = data_graph.get_neg_sampled_neighbors(nodes, 5, 0) >>> neg_neighbors = data_graph.get_neg_sampled_neighbors(nodes, 5, 0)
@ -253,6 +265,7 @@ class GraphData:
Examples: Examples:
>>> import mindspore.dataset as ds >>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2) >>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0) >>> nodes = data_graph.get_all_nodes(0)
>>> features = data_graph.get_node_feature(nodes, [1]) >>> features = data_graph.get_node_feature(nodes, [1])
@ -284,6 +297,7 @@ class GraphData:
Examples: Examples:
>>> import mindspore.dataset as ds >>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2) >>> data_graph = ds.GraphData('dataset_file', 2)
>>> edges = data_graph.get_all_edges(0) >>> edges = data_graph.get_all_edges(0)
>>> features = data_graph.get_edge_feature(edges, [1]) >>> features = data_graph.get_edge_feature(edges, [1])
@ -334,10 +348,11 @@ class GraphData:
A default value of -1 indicates that no node is given. A default value of -1 indicates that no node is given.
Returns: Returns:
numpy.ndarray: array of nodes. numpy.ndarray: Array of nodes.
Examples: Examples:
>>> import mindspore.dataset as ds >>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2) >>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.random_walk([1,2], [1,2,1,2,1]) >>> nodes = data_graph.random_walk([1,2], [1,2,1,2,1])

@ -13,10 +13,10 @@
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
""" """
Sampler module provides several samplers to generate sampling data from dataset. The sampler module provides several samplers to generate data from datasets.
There are following samplers: DistributedSampler, PKSampler, RandomSampler, The provided samplers include: DistributedSampler, PKSampler, RandomSampler,
SequentialSampler, SubsetRandomSampler, WeightedRandomSampler. SequentialSampler, SubsetRandomSampler, and WeightedRandomSampler.
User can also define custom sampler by extending from Sampler class. Users can also define a custom sampler by extending from the Sampler class.
""" """
import numpy as np import numpy as np
@ -26,9 +26,9 @@ import mindspore.dataset as ds
class Sampler: class Sampler:
""" """
Base class for user defined sampler. Base class for user defined sampler.
User defined sampler can be used with any existing dataset with sampler support. A user defined sampler can be used with any existing dataset with sampler support.
An required _iter_() method should by overridden by user for sample index generation. A required _iter_() method should be overridden by the user for sample index generation.
An optional reset() method can be overridden for per repeat reset, An optional reset() method can be overridden for per repeat reset,
dataset_size and num_samples will be set by dataset once a dataset iterator is created. dataset_size and num_samples will be set by dataset once a dataset iterator is created.
@ -52,8 +52,7 @@ class Sampler:
def __iter__(self): def __iter__(self):
""" """
User defined iterator, must be overridden. User defined iterator, must be overridden.
_handshake is guaranteed to be called prior to iterator construction _handshake is guaranteed to be called prior to iterator construction.
""" """
raise NotImplementedError raise NotImplementedError
@ -160,10 +159,10 @@ class BuiltinSampler:
def get_num_samples(self): def get_num_samples(self):
""" """
All samplers can contain a numeric num_samples value (or it could be set to None). All samplers can contain a numeric num_samples value (or it can be set to None).
Child sampler can exist or be None. A child sampler can exist or be None.
if child sampler exists, then the child sampler count can be a numeric value or None. If a child sampler exists, then the child sampler count can be a numeric value or None.
Given these conditions, we need to output what the sampler count is for this sampler. These conditions impact the resultant sampler count that is used.
The following table shows the possible results from calling this function. The following table shows the possible results from calling this function.
.. list-table:: .. list-table::
@ -217,20 +216,20 @@ class BuiltinSampler:
class DistributedSampler(BuiltinSampler): class DistributedSampler(BuiltinSampler):
""" """
Sampler that access a shard of the dataset. A sampler that accesses a shard of the dataset.
Args: Args:
num_shards (int): Number of shards to divide the dataset into. num_shards (int): Number of shards to divide the dataset into.
shard_id (int): Shard ID of the current shard within num_shards. shard_id (int): Shard ID of the current shard within num_shards.
shuffle (bool, optional): If true, the indices are shuffled (default=True). shuffle (bool, optional): If True, the indices are shuffled (default=True).
num_samples (int, optional): The number of samples to draw (default=None, all elements). num_samples (int, optional): The number of samples to draw (default=None, all elements).
offset(int, optional): Offset from shard when the element of dataset is allocated offset(int, optional): Offset from shard when the element of dataset is allocated (default=-1).
Examples: Examples:
>>> import mindspore.dataset as ds >>> import mindspore.dataset as ds
>>> >>>
>>> dataset_dir = "path/to/imagefolder_directory" >>> dataset_dir = "path/to/imagefolder_directory"
>>> >>>
>>> # creates a distributed sampler with 10 shards total. This shard is shard 5 >>> # creates a distributed sampler with 10 shards in total. This shard is shard 5.
>>> sampler = ds.DistributedSampler(10, 5) >>> sampler = ds.DistributedSampler(10, 5)
>>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler) >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
@ -304,8 +303,8 @@ class PKSampler(BuiltinSampler):
Args: Args:
num_val (int): Number of elements to sample for each class. num_val (int): Number of elements to sample for each class.
num_class (int, optional): Number of classes to sample (default=None, all classes). num_class (int, optional): Number of classes to sample (default=None, all classes).
shuffle (bool, optional): If true, the class IDs are shuffled (default=False). shuffle (bool, optional): If True, the class IDs are shuffled (default=False).
class_column (str, optional): Name of column to classify dataset(default='label'), for MindDataset. class_column (str, optional): Name of column with class labels for MindDataset (default='label').
num_samples (int, optional): The number of samples to draw (default=None, all elements). num_samples (int, optional): The number of samples to draw (default=None, all elements).
Examples: Examples:
@ -372,6 +371,7 @@ class PKSampler(BuiltinSampler):
c_sampler.add_child(c_child_sampler) c_sampler.add_child(c_child_sampler)
return c_sampler return c_sampler
class RandomSampler(BuiltinSampler): class RandomSampler(BuiltinSampler):
""" """
Samples the elements randomly. Samples the elements randomly.
@ -437,7 +437,7 @@ class SequentialSampler(BuiltinSampler):
Samples the dataset elements sequentially, same as not having a sampler. Samples the dataset elements sequentially, same as not having a sampler.
Args: Args:
start_index (int, optional): Index to start sampling at. (dafault=None starts at first id) start_index (int, optional): Index to start sampling at. (default=None, start at first ID)
num_samples (int, optional): Number of elements to sample (default=None, all elements). num_samples (int, optional): Number of elements to sample (default=None, all elements).
Examples: Examples:

@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
This module is to support text processing for nlp. It includes two parts: This module is to support text processing for NLP. It includes two parts:
transforms and utils. transforms is a high performance transforms and utils. transforms is a high performance
nlp text processing module which is developed with icu4c and cppjieba. NLP text processing module which is developed with ICU4C and cppjieba.
utils provides some general methods for nlp text processing. utils provides some general methods for NLP text processing.
""" """
import platform import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \ from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \

File diff suppressed because it is too large Load Diff

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
The module text.utils provides some general methods for nlp text processing. The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary, For example, you can use Vocab to build a dictionary,
use to_bytes and to_str to encode and decode strings into a specified format. use to_bytes and to_str to encode and decode strings into a specified format.
""" """

@ -131,7 +131,7 @@ def test_cv_minddataset_pk_sample_error_class_column():
create_cv_mindrecord(1) create_cv_mindrecord(1)
columns_list = ["data", "file_name", "label"] columns_list = ["data", "file_name", "label"]
num_readers = 4 num_readers = 4
sampler = ds.PKSampler(5, None, True, 'no_exsit_column') sampler = ds.PKSampler(5, None, True, 'no_exist_column')
with pytest.raises(Exception, match="MindRecordOp launch failed"): with pytest.raises(Exception, match="MindRecordOp launch failed"):
data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers, sampler=sampler) data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers, sampler=sampler)
num_iter = 0 num_iter = 0

Loading…
Cancel
Save