diff --git a/include/api/dual_abi_helper.h b/include/api/dual_abi_helper.h index 7d56d5ac72..e1cc1744c4 100644 --- a/include/api/dual_abi_helper.h +++ b/include/api/dual_abi_helper.h @@ -134,6 +134,15 @@ inline std::vector>> ClassIndexCharT return ret; } +inline std::vector, int64_t>> PairStringInt64ToPairCharInt64( + const std::vector> &s) { + std::vector, int64_t>> ret; + std::transform(s.begin(), s.end(), std::back_inserter(ret), [](auto str) { + return std::pair, int64_t>(std::vector(str.first.begin(), str.first.end()), str.second); + }); + return ret; +} + template inline std::map, T> PadInfoStringToChar(const std::map &s_pad_info) { std::map, T> ret; diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc index f3a17f1e6c..a831f3e8d0 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc @@ -232,12 +232,17 @@ PYBIND_REGISTER(UnicodeCharTokenizerOperation, 1, ([](const py::module *m) { })); })); -// TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++ -PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>( - *m, "WordpieceTokenizerOp") - .def(py::init &, const std::string &, const int &, const std::string &, - const bool &>()); +PYBIND_REGISTER(WordpieceTokenizerOperation, 1, ([](const py::module *m) { + (void)py::class_>(*m, + "WordpieceTokenizerOperation") + .def(py::init([](const std::shared_ptr &vocab, const std::string &suffix_indicator, + int32_t max_bytes_per_token, const std::string &unknown_token, bool with_offsets) { + auto wordpiece_tokenizer = std::make_shared( + vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets); + THROW_IF_ERROR(wordpiece_tokenizer->ValidateParams()); + return wordpiece_tokenizer; + })); })); PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) { diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc index 1918b79513..ad444abdfb 100644 --- a/mindspore/ccsrc/minddata/dataset/api/text.cc +++ b/mindspore/ccsrc/minddata/dataset/api/text.cc @@ -15,6 +15,8 @@ */ #include +#include +#include #include "minddata/dataset/include/text.h" @@ -131,7 +133,7 @@ std::shared_ptr JiebaTokenizer::Parse() { return jieba_tokenizer; } -Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) { +Status JiebaTokenizer::AddWordChar(const std::vector &word, int64_t freq) { if (word.empty()) { std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; MS_LOG(ERROR) << err_msg; @@ -142,7 +144,59 @@ Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) { MS_LOG(ERROR) << err_msg; RETURN_STATUS_SYNTAX_ERROR(err_msg); } - data_->words_list_.emplace_back(word, freq); + data_->words_list_.emplace_back(CharToString(word), freq); + return Status::OK(); +} + +Status JiebaTokenizer::AddDictChar(const std::vector, int64_t>> &user_dict) { + for (auto &word_freq_pair : user_dict) { + RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second)); + } + return Status::OK(); +} + +Status JiebaTokenizer::AddDictChar(const std::vector &file_path) { + std::vector> user_dict; + RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict)); + 
RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict))); + return Status::OK(); +} + +Status JiebaTokenizer::ParserFile(const std::string &file_path, + std::vector> *const user_dict) { + std::ifstream ifs(file_path); + if (!ifs) { + std::string err_msg = "JiebaTokenizer : Fail to load dictionary from the input file, check the file path."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + std::string line; + while (std::getline(ifs, line)) { + if (line.empty()) { + continue; + } + std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$"); + std::smatch tokens; + std::regex_match(line, tokens, regex); + if (std::regex_match(line, tokens, regex)) { + if (tokens.size() == 2) { + user_dict->emplace_back(tokens.str(1), 0); + } else if (tokens.size() == 3) { + user_dict->emplace_back(tokens.str(1), strtoll(tokens.str(2).c_str(), NULL, 0)); + } else { + continue; + } + } else { + continue; + } + } + MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of user input dictionary is: " << user_dict->size(); + MS_LOG(INFO) << "Valid rows in input dictionary (Maximum of first 10 rows are shown.):"; + for (std::size_t i = 0; i != user_dict->size(); ++i) { + if (i >= 10) break; + MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second; + } return Status::OK(); } @@ -310,6 +364,32 @@ std::shared_ptr UnicodeCharTokenizer::Parse() { return std::make_shared(data_->with_offsets_); } +// WordpieceTokenizer +struct WordpieceTokenizer::Data { + Data(const std::shared_ptr &vocab, const std::vector &suffix_indicator, int32_t max_bytes_per_token, + const std::vector &unknown_token, bool with_offsets) + : vocab_(vocab), + suffix_indicator_(CharToString(suffix_indicator)), + max_bytes_per_token_(max_bytes_per_token), + unknown_token_(CharToString(unknown_token)), + with_offsets_(with_offsets) {} + std::shared_ptr vocab_; + std::string suffix_indicator_; + int32_t max_bytes_per_token_; + std::string unknown_token_; + bool with_offsets_; +}; + +WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr &vocab, const std::vector &suffix_indicator, + int32_t max_bytes_per_token, const std::vector &unknown_token, + bool with_offsets) + : data_(std::make_shared(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {} + +std::shared_ptr WordpieceTokenizer::Parse() { + return std::make_shared( + data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_); +} + #ifndef _WIN32 // UnicodeScriptTokenizer struct UnicodeScriptTokenizer::Data { diff --git a/mindspore/ccsrc/minddata/dataset/include/text.h b/mindspore/ccsrc/minddata/dataset/include/text.h index b55402e0f8..7afff95a93 100644 --- a/mindspore/ccsrc/minddata/dataset/include/text.h +++ b/mindspore/ccsrc/minddata/dataset/include/text.h @@ -52,7 +52,7 @@ class BasicTokenizer final : public TensorTransform { /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', /// '[MASK]' (default=true). - /// \param[in] with_offsets If or not output offsets of tokens (default=false). + /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). 
explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false, const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true, bool with_offsets = false); @@ -88,7 +88,7 @@ class BertTokenizer final : public TensorTransform { /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', /// '[MASK]' (default=true). - /// \param[in] with_offsets If or not output offsets of tokens (default=false). + /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). explicit BertTokenizer(const std::shared_ptr &vocab, const std::string &suffix_indicator = "##", int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", bool lower_case = false, bool keep_whitespace = false, @@ -145,7 +145,7 @@ class JiebaTokenizer final : public TensorTransform { /// - JiebaMode.kMP, tokenize with MPSegment algorithm. /// - JiebaMode.kHMM, tokenize with Hiddel Markov Model Segment algorithm. /// - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm. - /// \param[in] with_offsets If or not output offsets of tokens (default=false). + /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). explicit JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix, bool with_offsets = false) : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {} @@ -156,7 +156,24 @@ class JiebaTokenizer final : public TensorTransform { /// \brief Destructor ~JiebaTokenizer() = default; - Status AddWord(const std::string &word, int64_t freq = 0); + /// \brief Add user defined word to JiebaTokenizer's dictionary. + /// \param[in] word The word to be added to the JiebaTokenizer instance. + /// The added word will not be written into the built-in dictionary on disk. + /// \param[in] freq The frequency of the word to be added. The higher the frequency, + /// the better chance the word will be tokenized (default=None, use default frequency). + Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); } + + /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary. + /// \param[in] user_dict Vector of word-freq pairs to be added to JiebaTokenizer's dictionary. + Status AddDict(const std::vector> &user_dict) { + return AddDictChar(PairStringInt64ToPairCharInt64(user_dict)); + } + + /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary from a file. + /// Only valid word-freq pairs in user provided file will be added into the dictionary. + /// Rows containing invalid input will be ignored, no error nor warning Status is returned. + /// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs. + Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); } protected: /// \brief Function to convert TensorTransform object into a TensorOperation object. @@ -164,6 +181,20 @@ class JiebaTokenizer final : public TensorTransform { std::shared_ptr Parse() override; private: + /// \brief Parser user defined word by file. + /// \param[in] file_path Path to the user defined file. + /// \param[in] user_dict Vector of word-freq pairs extracted from the user provided file. 
+ Status ParserFile(const std::string &file_path, std::vector> *const user_dict); + + /// \brief Used to translate all API string to vector of char and back + Status AddWordChar(const std::vector &word, int64_t freq = 0); + + /// \brief Used to translate all API string to vector of char and back + Status AddDictChar(const std::vector, int64_t>> &user_dict); + + /// \brief Used to translate all API string to vector of char and back + Status AddDictChar(const std::vector &file_path); + struct Data; std::shared_ptr data_; }; @@ -292,7 +323,7 @@ class RegexTokenizer final : public TensorTransform { /// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be /// matched by 'keep_delim_pattern'. The default value is an empty string ("") /// which means that delimiters will not be kept as an output token (default=""). - /// \param[in] with_offsets If or not output offsets of tokens (default=false). + /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false) : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {} @@ -416,7 +447,7 @@ class TruncateSequencePair final : public TensorTransform { class UnicodeCharTokenizer final : public TensorTransform { public: /// \brief Constructor. - /// \param[in] with_offsets If or not output offsets of tokens (default=false). + /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). explicit UnicodeCharTokenizer(bool with_offsets = false); /// \brief Destructor @@ -432,13 +463,45 @@ class UnicodeCharTokenizer final : public TensorTransform { std::shared_ptr data_; }; +/// \brief Tokenize scalar token or 1-D tokens to 1-D subword tokens. +class WordpieceTokenizer final : public TensorTransform { + public: + /// \brief Constructor. + /// \param[in] vocab A Vocab object. + /// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##'). + /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100). + /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty + /// string, else return the string specified (default='[UNK]'). + /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). + explicit WordpieceTokenizer(const std::shared_ptr &vocab, const std::string &suffix_indicator = "##", + int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", + bool with_offsets = false) + : WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token), + with_offsets) {} + + explicit WordpieceTokenizer(const std::shared_ptr &vocab, const std::vector &suffix_indicator, + int32_t max_bytes_per_token, const std::vector &unknown_token, bool with_offsets); + + /// \brief Destructor + ~WordpieceTokenizer() = default; + + protected: + /// \brief Function to convert TensorTransform object into a TensorOperation object. + /// \return Shared pointer to TensorOperation object. + std::shared_ptr Parse() override; + + private: + struct Data; + std::shared_ptr data_; +}; + #ifndef _WIN32 /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. class UnicodeScriptTokenizer final : public TensorTransform { public: /// \brief Constructor. 
- /// \param[in] keep_whitespace If or not emit whitespace tokens (default=false). - /// \param[in] with_offsets If or not output offsets of tokens (default=false). + /// \param[in] keep_whitespace Whether or not emit whitespace tokens (default=false). + /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false); /// \brief Destructor @@ -458,7 +521,7 @@ class UnicodeScriptTokenizer final : public TensorTransform { class WhitespaceTokenizer final : public TensorTransform { public: /// \brief Constructor. - /// \param[in] with_offsets If or not output offsets of tokens (default=false). + /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). explicit WhitespaceTokenizer(bool with_offsets = false); /// \brief Destructor diff --git a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc index 3a7d9e9353..4d72217113 100644 --- a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc +++ b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc @@ -36,6 +36,7 @@ #include "minddata/dataset/text/kernels/to_number_op.h" #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" +#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" #ifndef _WIN32 #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" @@ -396,6 +397,39 @@ std::shared_ptr UnicodeCharTokenizerOperation::Build() { return tensor_op; } +// WordpieceTokenizerOperation +WordpieceTokenizerOperation::WordpieceTokenizerOperation(const std::shared_ptr &vocab, + const std::string &suffix_indicator, + int32_t max_bytes_per_token, const std::string &unknown_token, + bool with_offsets) + : vocab_(vocab), + suffix_indicator_(suffix_indicator), + max_bytes_per_token_(max_bytes_per_token), + unknown_token_(unknown_token), + with_offsets_(with_offsets) {} + +Status WordpieceTokenizerOperation::ValidateParams() { + if (vocab_ == nullptr) { + std::string err_msg = "WordpieceTokenizer: vocab object type is incorrect or null."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + if (max_bytes_per_token_ < 0) { + std::string err_msg = + "WordpieceTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " + + std::to_string(max_bytes_per_token_); + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + return Status::OK(); +} + +std::shared_ptr WordpieceTokenizerOperation::Build() { + std::shared_ptr tensor_op = std::make_shared( + vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, with_offsets_); + return tensor_op; +} + #ifndef _WIN32 // UnicodeScriptTokenizerOperation UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets) diff --git a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h index c06970f644..32b54599ab 100644 --- a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h +++ b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h @@ -49,6 +49,7 @@ constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair"; constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer"; constexpr char kUnicodeScriptTokenizerOperation[] = 
"UnicodeScriptTokenizer"; constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer"; +constexpr char kWordpieceTokenizerOperation[] = "WordpieceTokenizer"; /* ####################################### Derived TensorOperation classes ################################# */ @@ -318,6 +319,28 @@ class UnicodeCharTokenizerOperation : public TensorOperation { bool with_offsets_; }; +class WordpieceTokenizerOperation : public TensorOperation { + public: + explicit WordpieceTokenizerOperation(const std::shared_ptr &vocab, const std::string &suffix_indicator, + int32_t max_bytes_per_token, const std::string &unknown_token, + bool with_offsets); + + ~WordpieceTokenizerOperation() = default; + + std::shared_ptr Build() override; + + Status ValidateParams() override; + + std::string Name() const override { return kWordpieceTokenizerOperation; } + + private: + std::shared_ptr vocab_; + std::string suffix_indicator_; + int32_t max_bytes_per_token_; + std::string unknown_token_; + bool with_offsets_; +}; + #ifndef _WIN32 class UnicodeScriptTokenizerOperation : public TensorOperation { public: diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index 259aa19be1..0e7a1ebe0a 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -2207,7 +2207,7 @@ def _pyfunc_worker_init(pyfunc_list): # All exceptions will be raised to main processes def _pyfunc_worker_exec(index, *args): """ - Internal function for call certain pyfunc in python process. + Internal function for call certain pyfunc in Python process. """ # Some threads in multiprocess.pool can't process sigint signal, # and will occur hang problem, so ctrl+c will pass to parent process. @@ -2352,7 +2352,7 @@ class MapDataset(Dataset): # Pass #1, look for Python callables and build list for op in self.operations: - # our c transforms is now callable and should not be run in python multithreading + # our c transforms is now callable and should not be run in Python multithreading if callable(op) and str(op).find("c_transform") < 0: callable_list.append(op) @@ -2373,7 +2373,7 @@ class MapDataset(Dataset): with _LOCK: _OP_PROCESS.update(process_id) for op in self.operations: - # our c transforms is now callable and should not be run in python multithreading + # our c transforms is now callable and should not be run in Python multithreading if callable(op) and str(op).find("c_transform") < 0: # Wrap Python callable into _PythonCallable iter_specific_operations.append(_PythonCallable(op, idx, self.process_pool)) diff --git a/mindspore/dataset/engine/samplers.py b/mindspore/dataset/engine/samplers.py index 8c1621495e..5e263e92c0 100644 --- a/mindspore/dataset/engine/samplers.py +++ b/mindspore/dataset/engine/samplers.py @@ -610,7 +610,7 @@ class SubsetSampler(BuiltinSampler): Samples the elements from a sequence of indices. Args: - indices (Any iterable python object but string): A sequence of indices. + indices (Any iterable Python object but string): A sequence of indices. num_samples (int, optional): Number of elements to sample (default=None, all elements). Examples: diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index b0c2fb8565..348d87db7a 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -102,7 +102,7 @@ class JiebaTokenizer(TextTensorOperation): - JiebaMode.MP, tokenize with MPSegment algorithm. - JiebaMode.HMM, tokenize with Hiddel Markov Model Segment algorithm. 
- JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm. - with_offsets (bool, optional): If or not output offsets of tokens (default=False). + with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). Examples: >>> from mindspore.dataset.text import JiebaMode @@ -186,6 +186,9 @@ class JiebaTokenizer(TextTensorOperation): word2 None word3 freq3 + Only valid word-freq pairs in user provided file will be added into the dictionary. + Rows containing invalid input will be ignored. No error nor warning Status is returned. + Examples: >>> from mindspore.dataset.text import JiebaMode >>> jieba_hmm_file = "/path/to/jieba/hmm/file" @@ -221,16 +224,16 @@ class JiebaTokenizer(TextTensorOperation): "user dict file {} is not exist.".format(file_path)) real_file_path = os.path.realpath(file_path) file_dict = open(real_file_path) - data_re = re.compile('^(.+?)( [0-9]+)?$', re.U) + data_re = re.compile('^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$', re.U) words_list = [] for item in file_dict: data = item.strip() if not isinstance(data, str): data = self.__decode(data) - words = data_re.match(data).groups() - if len(words) != 2: - raise ValueError( - "user dict file {} format error.".format(real_file_path)) + tmp = data_re.match(data) + if not tmp: + continue + words = tmp.groups() words_list.append(words) file_dict.close() return words_list @@ -452,7 +455,7 @@ class UnicodeCharTokenizer(TextTensorOperation): Tokenize a scalar tensor of UTF-8 string to Unicode characters. Args: - with_offsets (bool, optional): If or not output offsets of tokens (default=False). + with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). Examples: >>> # If with_offsets=False, default output one column {["text", dtype=str]} @@ -474,8 +477,7 @@ class UnicodeCharTokenizer(TextTensorOperation): return cde.UnicodeCharTokenizerOperation(self.with_offsets) -# TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++ -class WordpieceTokenizer(cde.WordpieceTokenizerOp): +class WordpieceTokenizer(TextTensorOperation): """ Tokenize scalar token or 1-D tokens to 1-D subword tokens. @@ -485,7 +487,7 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100). unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string, return the token directly, else return 'unknown_token' (default='[UNK]'). - with_offsets (bool, optional): If or not output offsets of tokens (default=False). + with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). Examples: >>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"] @@ -511,8 +513,10 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): self.max_bytes_per_token = max_bytes_per_token self.unknown_token = unknown_token self.with_offsets = with_offsets - super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, - self.unknown_token, self.with_offsets) + + def parse(self): + return cde.WordpieceTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token, + self.unknown_token, self.with_offsets) class PythonTokenizer: @@ -572,7 +576,7 @@ if platform.system().lower() != 'windows': only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE). 
preserve_unused_token (bool, optional): If True, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). - with_offsets (bool, optional): If or not output offsets of tokens (default=False). + with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). Examples: >>> from mindspore.dataset.text import NormalizeForm @@ -638,7 +642,7 @@ if platform.system().lower() != 'windows': only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE). preserve_unused_token (bool, optional): If True, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). - with_offsets (bool, optional): If or not output offsets of tokens (default=False). + with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). Examples: >>> from mindspore.dataset.text import NormalizeForm @@ -793,7 +797,7 @@ if platform.system().lower() != 'windows': keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('') which means that delimiters will not be kept as an output token (default=''). - with_offsets (bool, optional): If or not output offsets of tokens (default=False). + with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). Examples: >>> # If with_offsets=False, default output one column {["text", dtype=str]} @@ -829,8 +833,8 @@ if platform.system().lower() != 'windows': UnicodeScriptTokenizer is not supported on Windows platform yet. Args: - keep_whitespace (bool, optional): If or not emit whitespace tokens (default=False). - with_offsets (bool, optional): If or not output offsets of tokens (default=False). + keep_whitespace (bool, optional): Whether or not emit whitespace tokens (default=False). + with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). Examples: >>> # If with_offsets=False, default output one column {["text", dtype=str]} @@ -865,7 +869,7 @@ if platform.system().lower() != 'windows': WhitespaceTokenizer is not supported on Windows platform yet. Args: - with_offsets (bool, optional): If or not output offsets of tokens (default=False). + with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). 
Examples: >>> # If with_offsets=False, default output one column {["text", dtype=str]} diff --git a/tests/ut/cpp/dataset/c_api_text_test.cc b/tests/ut/cpp/dataset/c_api_text_test.cc index 13414cbfdb..23c4ea73ee 100644 --- a/tests/ut/cpp/dataset/c_api_text_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_test.cc @@ -1048,7 +1048,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) { EXPECT_NE(jieba_tokenizer, nullptr); // Add word with freq not provided (default 0) - jieba_tokenizer->AddWord("男默女泪"); + ASSERT_OK(jieba_tokenizer->AddWord("男默女泪")); // Create Map operation on ds ds = ds->Map({jieba_tokenizer}, {"text"}); @@ -1100,7 +1100,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) { EXPECT_NE(jieba_tokenizer, nullptr); // Add word with freq is set explicitly to 0 - jieba_tokenizer->AddWord("男默女泪", 0); + ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0)); // Create Map operation on ds ds = ds->Map({jieba_tokenizer}, {"text"}); @@ -1152,7 +1152,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) { EXPECT_NE(jieba_tokenizer, nullptr); // Add word with freq 10 - jieba_tokenizer->AddWord("男默女泪", 10); + ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10)); // Create Map operation on ds ds = ds->Map({jieba_tokenizer}, {"text"}); @@ -1204,7 +1204,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) { EXPECT_NE(jieba_tokenizer, nullptr); // Add word with freq 20000 - jieba_tokenizer->AddWord("江大桥", 20000); + ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000)); // Create Map operation on ds ds = ds->Map({jieba_tokenizer}, {"text"}); @@ -1262,6 +1262,115 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) { EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK()); } +TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) { + // Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pair. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + std::shared_ptr jieba_tokenizer = + std::make_shared(hmm_path, mp_path, JiebaMode::kMp); + EXPECT_NE(jieba_tokenizer, nullptr); + + // Add word with freq 20000 + std::vector> user_dict = {{"江大桥", 20000}}; + ASSERT_OK(jieba_tokenizer->AddDict(user_dict)); + + // Create Map operation on ds + ds = ds->Map({jieba_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + iter->GetNextRow(&row); + + std::vector expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"}; + std::shared_ptr de_expected_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); + mindspore::MSTensor expected_tensor = + mindspore::MSTensor(std::make_shared(de_expected_tensor)); + + uint64_t i = 0; + while (row.size() != 0) { + auto txt = row["text"]; + EXPECT_MSTENSOR_EQ(txt, expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 1); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) { + // Testing AddDict of JiebaTokenizer when the input is a path to dict. + // Test error scenario for AddDict: invalid path + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + std::shared_ptr jieba_tokenizer = + std::make_shared(hmm_path, mp_path, JiebaMode::kMp); + EXPECT_NE(jieba_tokenizer, nullptr); + + // Load dict from txt file + std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt"; + std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt"; + EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path)); + ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path)); + + // Create Map operation on ds + ds = ds->Map({jieba_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + iter->GetNextRow(&row); + + std::vector expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"}; + std::shared_ptr de_expected_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); + mindspore::MSTensor expected_tensor = + mindspore::MSTensor(std::make_shared(de_expected_tensor)); + + uint64_t i = 0; + while (row.size() != 0) { + auto txt = row["text"]; + EXPECT_MSTENSOR_EQ(txt, expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 1); + + // Manually terminate the pipeline + iter->Stop(); +} + TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) { // Testing the parameter of SlidingWindow interface when the axis is 0. 
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess."; @@ -2662,6 +2771,421 @@ TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) { iter->Stop(); } +std::vector vocab_english = {"book", "cholera", "era", "favor", "##ite", "my", + "is", "love", "dur", "##ing", "the"}; + +std::vector vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"}; + +TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1."; + // Test WordpieceTokenizer with default parameters on English vocab + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create Take operation on ds + ds = ds->Take(10); + EXPECT_NE(ds, nullptr); + + // Create a vocab from vector + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create WordpieceTokenizer operation on ds + std::shared_ptr wordpiece_tokenizer = std::make_shared(vocab); + EXPECT_NE(wordpiece_tokenizer, nullptr); + + // Create Map operation on ds + ds = ds->Map({wordpiece_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + iter->GetNextRow(&row); + + std::vector> expected = { + {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}}; + + uint64_t i = 0; + while (row.size() != 0) { + auto txt = row["text"]; + std::shared_ptr de_expected_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); + mindspore::MSTensor expected_tensor = + mindspore::MSTensor(std::make_shared(de_expected_tensor)); + EXPECT_MSTENSOR_EQ(txt, expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 10); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2."; + // Test WordpieceTokenizer with empty unknown_token + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create Take operation on ds + ds = ds->Take(10); + EXPECT_NE(ds, nullptr); + + // Create a vocab from vector + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create WordpieceTokenizer operation on ds + std::shared_ptr wordpiece_tokenizer = + std::make_shared(vocab, "##", 100, "", false); + EXPECT_NE(wordpiece_tokenizer, nullptr); + + // Create Map operation on ds + ds = ds->Map({wordpiece_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + iter->GetNextRow(&row); + + std::vector> expected = { + {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}}; + + uint64_t i = 0; + while (row.size() != 0) { + auto txt = row["text"]; + std::shared_ptr de_expected_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); + mindspore::MSTensor expected_tensor = + mindspore::MSTensor(std::make_shared(de_expected_tensor)); + EXPECT_MSTENSOR_EQ(txt, expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 10); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3."; + // Test WordpieceTokenizer with non-default max_bytes_per_token + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create Take operation on ds + ds = ds->Take(10); + EXPECT_NE(ds, nullptr); + + // Create a vocab from vector + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create WordpieceTokenizer operation on ds + std::shared_ptr wordpiece_tokenizer = + std::make_shared(vocab, "##", 4, "[UNK]", false); + EXPECT_NE(wordpiece_tokenizer, nullptr); + + // Create Map operation on ds + ds = ds->Map({wordpiece_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + iter->GetNextRow(&row); + + std::vector> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"}, + {"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}}; + + uint64_t i = 0; + while (row.size() != 0) { + auto txt = row["text"]; + std::shared_ptr de_expected_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); + mindspore::MSTensor expected_tensor = + mindspore::MSTensor(std::make_shared(de_expected_tensor)); + EXPECT_MSTENSOR_EQ(txt, expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 10); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4."; + // Test WordpieceTokenizer with default parameters on Chinese vocab + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create Skip operation on ds + ds = ds->Skip(10); + EXPECT_NE(ds, nullptr); + + // Create Take operation on ds + ds = ds->Take(15); + EXPECT_NE(ds, nullptr); + + // Create a vocab from vector + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create WordpieceTokenizer operation on ds + std::shared_ptr wordpiece_tokenizer = + std::make_shared(vocab, "##", 100, "[UNK]", false); + EXPECT_NE(wordpiece_tokenizer, nullptr); + + // Create Map operation on ds + ds = ds->Map({wordpiece_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + iter->GetNextRow(&row); + + std::vector> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"}, {"霍"}, + {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}}; + + uint64_t i = 0; + while (row.size() != 0) { + auto txt = row["text"]; + std::shared_ptr de_expected_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); + mindspore::MSTensor expected_tensor = + mindspore::MSTensor(std::make_shared(de_expected_tensor)); + EXPECT_MSTENSOR_EQ(txt, expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 15); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5."; + // Test WordpieceTokenizer with with_offsets true + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create Take operation on ds + ds = ds->Take(10); + EXPECT_NE(ds, nullptr); + + // Create a vocab from vector + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create WordpieceTokenizer operation on ds + std::shared_ptr wordpiece_tokenizer = + std::make_shared(vocab, "##", 100, "[UNK]", true); + EXPECT_NE(wordpiece_tokenizer, nullptr); + + // Create Map operation on ds + ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + iter->GetNextRow(&row); + + std::vector> expected = { + {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}}; + std::vector> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}}; + std::vector> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}}; + + uint64_t i = 0; + while (row.size() != 0) { + auto txt = row["token"]; + std::shared_ptr de_expected_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); + mindspore::MSTensor expected_tensor = + mindspore::MSTensor(std::make_shared(de_expected_tensor)); + EXPECT_MSTENSOR_EQ(txt, expected_tensor); + + auto start = row["offsets_start"]; + std::shared_ptr de_expected_start_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor)); + mindspore::MSTensor expected_start_tensor = + mindspore::MSTensor(std::make_shared(de_expected_start_tensor)); + EXPECT_MSTENSOR_EQ(start, expected_start_tensor); + + auto limit = row["offsets_limit"]; + std::shared_ptr de_expected_limit_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor)); + mindspore::MSTensor expected_limit_tensor = + mindspore::MSTensor(std::make_shared(de_expected_limit_tensor)); + EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 10); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6."; + // Test WordpieceTokenizer with max_bytes_per_token equals to 0 + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create Take operation on ds + ds = ds->Take(10); + EXPECT_NE(ds, nullptr); + + // Create a vocab from vector + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create WordpieceTokenizer operation on ds + std::shared_ptr wordpiece_tokenizer = + std::make_shared(vocab, "##", 0, "[UNK]", true); + EXPECT_NE(wordpiece_tokenizer, nullptr); + + // Create Map operation on ds + ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + iter->GetNextRow(&row); + + std::vector> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, + {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}}; + + uint64_t i = 0; + while (row.size() != 0) { + auto txt = row["token"]; + std::shared_ptr de_expected_tensor; + ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); + mindspore::MSTensor expected_tensor = + mindspore::MSTensor(std::make_shared(de_expected_tensor)); + EXPECT_MSTENSOR_EQ(txt, expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 10); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1."; + // Test WordpieceTokenizer with nullptr vocab + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create WordpieceTokenizer operation on ds + std::shared_ptr wordpiece_tokenizer = std::make_shared(nullptr); + EXPECT_NE(wordpiece_tokenizer, nullptr); + + // Create a Map operation on ds + ds = ds->Map({wordpiece_tokenizer}); + EXPECT_NE(ds, nullptr); + + std::shared_ptr iter = ds->CreateIterator(); + // Expect failure: invalid WordpieceTokenizer input with nullptr vocab + EXPECT_EQ(iter, nullptr); +} + +TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2."; + // Test WordpieceTokenizer with negative max_bytes_per_token + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create a vocab from vector + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create WordpieceTokenizer operation on ds + std::shared_ptr wordpiece_tokenizer = std::make_shared(vocab, "##", -1); + EXPECT_NE(wordpiece_tokenizer, nullptr); + + // Create a Map operation on ds + ds = ds->Map({wordpiece_tokenizer}); + EXPECT_NE(ds, nullptr); + + std::shared_ptr iter = ds->CreateIterator(); + // Expect failure: invalid WordpieceTokenizer input with nullptr vocab + EXPECT_EQ(iter, nullptr); +} + TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) { // Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";
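
Usage sketch for the public C++ APIs this patch introduces (JiebaTokenizer::AddWord/AddDict and the new WordpieceTokenizer transform). The model/dictionary paths and the sample vocabulary below are placeholders, and the vocab.h include path is assumed; the class names, signatures, and namespaces follow the headers and unit tests in the hunks above, but this is a sketch rather than code taken from the repository.

// Sketch only: placeholder paths and vocabulary; the Vocab include path is an assumption.
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "minddata/dataset/include/text.h"  // text::JiebaTokenizer, text::WordpieceTokenizer
#include "minddata/dataset/text/vocab.h"    // Vocab (assumed location)

using namespace mindspore::dataset;  // Status, Vocab, JiebaMode, text::*

Status BuildTokenizers(std::shared_ptr<text::JiebaTokenizer> *jieba_out,
                       std::shared_ptr<text::WordpieceTokenizer> *wordpiece_out) {
  // Jieba: words can be added one at a time, as a vector of word-freq pairs,
  // or from a user dictionary file; rows that do not match "word [freq]" are skipped.
  auto jieba = std::make_shared<text::JiebaTokenizer>("/path/to/hmm_model.utf8",
                                                      "/path/to/jieba.dict.utf8", JiebaMode::kMp);
  Status rc = jieba->AddWord("江大桥", 20000);
  if (rc != Status::OK()) return rc;
  std::vector<std::pair<std::string, int64_t>> user_dict = {{"长江大桥", 10000}};
  rc = jieba->AddDict(user_dict);
  if (rc != Status::OK()) return rc;
  rc = jieba->AddDict("/path/to/user_dict.txt");
  if (rc != Status::OK()) return rc;

  // Wordpiece: requires a Vocab; tokens not found in the vocab map to "[UNK]" by default.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  rc = Vocab::BuildFromVector({"my", "favor", "##ite", "book"}, {}, true, &vocab);
  if (rc != Status::OK()) return rc;
  auto wordpiece = std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);

  // Both objects are TensorTransforms and can be passed to Dataset::Map,
  // e.g. ds = ds->Map({jieba}, {"text"});
  *jieba_out = jieba;
  *wordpiece_out = wordpiece;
  return Status::OK();
}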