diff --git a/mindspore/dataset/__init__.py b/mindspore/dataset/__init__.py index 0bacb30928..b21d1b94aa 100644 --- a/mindspore/dataset/__init__.py +++ b/mindspore/dataset/__init__.py @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -This module provides APIs to load and process various datasets: MNIST, -CIFAR-10, CIFAR-100, VOC, ImageNet, CelebA dataset, etc. It also supports -datasets in special format, including mindrecord, tfrecord, manifest. Users -can also create samplers with this module to sample data. +This module provides APIs to load and process various common datasets such as MNIST, +CIFAR-10, CIFAR-100, VOC, ImageNet, CelebA, etc. It also supports datasets in standard +format, including MindRecord, TFRecord, Manifest, etc. Users can also define their own +datasets with this module. + +In addition, this module provides APIs to sample data while loading. + +Please note that cache is not supported on Windows platform yet. Please do not use it +while loading and processing data on Windows. """ from .core import config diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index a8ec4ff560..af91ec9f14 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -418,7 +418,7 @@ if platform.system().lower() != 'windows': Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'. Note: - The WhitespaceTokenizer is not supported on windows platform yet. + WhitespaceTokenizer is not supported on Windows platform yet. Args: with_offsets (bool, optional): If or not output offsets of tokens (default=False). @@ -449,7 +449,7 @@ if platform.system().lower() != 'windows': Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. Note: - The UnicodeScriptTokenizer is not supported on windows platform yet. 
+ UnicodeScriptTokenizer is not supported on Windows platform yet. Args: keep_whitespace (bool, optional): If or not emit whitespace tokens (default=False). @@ -482,7 +482,7 @@ if platform.system().lower() != 'windows': Apply case fold operation on utf-8 string tensor. Note: - The CaseFold is not supported on windows platform yet. + CaseFold is not supported on Windows platform yet. Examples: >>> import mindspore.dataset.text as text @@ -505,7 +505,7 @@ if platform.system().lower() != 'windows': Apply normalize operation on utf-8 string tensor. Note: - The NormalizeUTF8 is not supported on windows platform yet. + NormalizeUTF8 is not supported on Windows platform yet. Args: normalize_form (NormalizeForm, optional): Valid values can be any of [NormalizeForm.NONE, @@ -541,7 +541,7 @@ if platform.system().lower() != 'windows': See http://userguide.icu-project.org/strings/regexp for support regex pattern. Note: - The RegexReplace is not supported on windows platform yet. + RegexReplace is not supported on Windows platform yet. Args: pattern (str): the regex expression patterns. @@ -572,7 +572,7 @@ if platform.system().lower() != 'windows': See http://userguide.icu-project.org/strings/regexp for support regex pattern. Note: - The RegexTokenizer is not supported on windows platform yet. + RegexTokenizer is not supported on Windows platform yet. Args: delim_pattern (str): The pattern of regex delimiters. @@ -610,7 +610,7 @@ if platform.system().lower() != 'windows': Tokenize a scalar tensor of UTF-8 string by specific rules. Note: - The BasicTokenizer is not supported on windows platform yet. + BasicTokenizer is not supported on Windows platform yet. Args: lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation @@ -666,7 +666,7 @@ if platform.system().lower() != 'windows': Tokenizer used for Bert text process. Note: - The BertTokenizer is not supported on windows platform yet. 
+ BertTokenizer is not supported on Windows platform yet. Args: vocab (Vocab): A vocabulary object.