diff --git a/mindspore/dataset/text/__init__.py b/mindspore/dataset/text/__init__.py
index 7c43a2888c..04eb90a0b6 100644
--- a/mindspore/dataset/text/__init__.py
+++ b/mindspore/dataset/text/__init__.py
@@ -24,7 +24,7 @@ from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm
 
 __all__ = [
     "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
-    "to_str", "to_bytes", "JiebaMode", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
+    "to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
     "PythonTokenizer"
 ]
 
@@ -33,4 +33,4 @@ if platform.system().lower() != 'windows':
         RegexReplace, RegexTokenizer, BasicTokenizer, BertTokenizer, PythonTokenizer
 
     __all__.append(["UnicodeScriptTokenizer", "WhitespaceTokenizer", "CaseFold", "NormalizeUTF8",
-                    "RegexReplace", "RegexTokenizer", "BasicTokenizer", "BertTokenizer", "NormalizeForm"])
+                    "RegexReplace", "RegexTokenizer", "BasicTokenizer", "BertTokenizer"])
diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index c50b8e4f75..8b0d47df25 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -119,10 +119,12 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
             the dictionary can be obtained on the official website of cppjieba.
         mp_path (str): the dictionary file is used by MPSegment algorithm,
             the dictionary can be obtained on the official website of cppjieba.
-        mode (JiebaMode, optional): "MP" model will tokenize with MPSegment algorithm,
-            "HMM" mode will tokenize with Hiddel Markov Model Segment algorithm,
-            "MIX" model will tokenize with a mix of MPSegment and HMMSegment algorithm
-            (default="MIX").
+        mode (JiebaMode, optional): Valid values can be any of [JiebaMode.MP, JiebaMode.HMM,
+            JiebaMode.MIX] (default=JiebaMode.MIX).
+
+            - JiebaMode.MP, tokenize with MPSegment algorithm.
+            - JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
+            - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
     """
 
     @check_jieba_init
@@ -287,10 +289,16 @@ if platform.system().lower() != 'windows':
         Apply normalize operation on utf-8 string tensor.
 
         Args:
-            normalize_form (NormalizeForm, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
-                If set "NONE", will do nothing for input string tensor.
-                If set to any of "NFC", "NFKC", "NFD", "NFKD", will apply normalize operation(default="NFKC").
-                See http://unicode.org/reports/tr15/ for details.
+            normalize_form (NormalizeForm, optional): Valid values can be any of [NormalizeForm.NONE,
+                NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD,
+                NormalizeForm.NFKD] (default=NormalizeForm.NFKC).
+                See http://unicode.org/reports/tr15/ for details.
+
+                - NormalizeForm.NONE, do nothing for input string tensor.
+                - NormalizeForm.NFC, normalize with Normalization Form C.
+                - NormalizeForm.NFKC, normalize with Normalization Form KC.
+                - NormalizeForm.NFD, normalize with Normalization Form D.
+                - NormalizeForm.NFKD, normalize with Normalization Form KD.
         """
 
         def __init__(self, normalize_form=NormalizeForm.NFKC):
diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py
index 766de76e01..7347a4b854 100644
--- a/mindspore/dataset/text/utils.py
+++ b/mindspore/dataset/text/utils.py
@@ -24,6 +24,9 @@ import mindspore._c_dataengine as cde
 
 from .validators import check_from_file, check_from_list, check_from_dict, check_from_dataset
 
+__all__ = [
+    "Vocab", "to_str", "to_bytes"
+]
 
 class Vocab(cde.Vocab):
     """
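
For reference, a minimal usage sketch of the two operators whose docstrings are rewritten above. It is not part of the patch: the cppjieba dictionary paths are hypothetical placeholders, and NormalizeUTF8 is only exported on non-Windows platforms per the guard in __init__.py. Note that JiebaMode and NormalizeForm are dropped from __all__ by this change but should remain importable, since __init__.py still imports them from .utils.

    import mindspore.dataset.text as text
    from mindspore.dataset.text import JiebaMode, NormalizeForm

    # Hypothetical cppjieba dictionary paths; the real files come from cppjieba.
    HMM_FILE = "/path/to/hmm_model.utf8"
    MP_FILE = "/path/to/jieba.dict.utf8"

    # Tokenize with a mix of MPSegment and HMMSegment (the documented default).
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)

    # Normalize UTF-8 strings with Normalization Form KC (the documented default).
    normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFKC)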