@@ -52,7 +52,7 @@ class BasicTokenizer final : public TensorTransform {
   /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
   /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
   /// '[MASK]' (default=true).
-  /// \param[in] with_offsets If or not output offsets of tokens (default=false).
+  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
   explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
                           const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
                           bool with_offsets = false);
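
Usage sketch (not part of the patch): how the with_offsets flag documented in this hunk might be used, assuming the mindspore::dataset C++ API; the Map call and column names below are illustrative.

  // Build a BasicTokenizer that also emits token offsets (sketch only).
  auto tokenizer = std::make_shared<text::BasicTokenizer>(
      /*lower_case=*/true, /*keep_whitespace=*/false, NormalizeForm::kNone,
      /*preserve_unused_token=*/true, /*with_offsets=*/true);
  // With with_offsets = true the op is expected to output offset columns alongside the tokens, e.g.
  // ds = ds->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});  // column names assumed
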
@@ -88,7 +88,7 @@ class BertTokenizer final : public TensorTransform {
   /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
   /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
   /// '[MASK]' (default=true).
-  /// \param[in] with_offsets If or not output offsets of tokens (default=false).
+  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
   explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
                          int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
                          bool lower_case = false, bool keep_whitespace = false,
@@ -145,7 +145,7 @@ class JiebaTokenizer final : public TensorTransform {
   /// - JiebaMode.kMP, tokenize with MPSegment algorithm.
   /// - JiebaMode.kHMM, tokenize with Hiddel Markov Model Segment algorithm.
   /// - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
-  /// \param[in] with_offsets If or not output offsets of tokens (default=false).
+  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
   explicit JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
                           const JiebaMode &mode = JiebaMode::kMix, bool with_offsets = false)
       : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {}
@@ -156,7 +156,24 @@ class JiebaTokenizer final : public TensorTransform {
   /// \brief Destructor
   ~JiebaTokenizer() = default;

-  Status AddWord(const std::string &word, int64_t freq = 0);
+  /// \brief Add user defined word to JiebaTokenizer's dictionary.
+  /// \param[in] word The word to be added to the JiebaTokenizer instance.
+  /// The added word will not be written into the built-in dictionary on disk.
+  /// \param[in] freq The frequency of the word to be added. The higher the frequency,
+  /// the better chance the word will be tokenized (default=None, use default frequency).
+  Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); }
+
+  /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary.
+  /// \param[in] user_dict Vector of word-freq pairs to be added to JiebaTokenizer's dictionary.
+  Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) {
+    return AddDictChar(PairStringInt64ToPairCharInt64(user_dict));
+  }
+
+  /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary from a file.
+  /// Only valid word-freq pairs in user provided file will be added into the dictionary.
+  /// Rows containing invalid input will be ignored, no error nor warning Status is returned.
+  /// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs.
+  Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); }

  protected:
   /// \brief Function to convert TensorTransform object into a TensorOperation object.
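
Usage sketch (not part of the patch): calling the AddWord/AddDict overloads added in this hunk; the tokenizer construction, file paths, and words below are placeholders.

  auto jieba = std::make_shared<text::JiebaTokenizer>("hmm_model.utf8", "jieba.dict.utf8", JiebaMode::kMix);
  Status rc = jieba->AddWord("MindSpore", 100);                     // one word with an explicit frequency
  rc = jieba->AddDict({{"TensorTransform", 20}, {"minddata", 0}});  // freq 0 falls back to the default frequency
  rc = jieba->AddDict("user_dict.txt");                             // word-freq pairs from a file; invalid rows are skipped
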
@@ -164,6 +181,20 @@ class JiebaTokenizer final : public TensorTransform {
   std::shared_ptr<TensorOperation> Parse() override;

  private:
+  /// \brief Parser user defined word by file.
+  /// \param[in] file_path Path to the user defined file.
+  /// \param[in] user_dict Vector of word-freq pairs extracted from the user provided file.
+  Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict);
+
+  /// \brief Used to translate all API string to vector of char and back
+  Status AddWordChar(const std::vector<char> &word, int64_t freq = 0);
+
+  /// \brief Used to translate all API string to vector of char and back
+  Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict);
+
+  /// \brief Used to translate all API string to vector of char and back
+  Status AddDictChar(const std::vector<char> &file_path);
+
   struct Data;
   std::shared_ptr<Data> data_;
 };
@@ -292,7 +323,7 @@ class RegexTokenizer final : public TensorTransform {
   /// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be
   /// matched by 'keep_delim_pattern'. The default value is an empty string ("")
   /// which means that delimiters will not be kept as an output token (default="").
-  /// \param[in] with_offsets If or not output offsets of tokens (default=false).
+  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
   explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false)
       : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {}
@@ -416,7 +447,7 @@ class TruncateSequencePair final : public TensorTransform {
 class UnicodeCharTokenizer final : public TensorTransform {
  public:
   /// \brief Constructor.
-  /// \param[in] with_offsets If or not output offsets of tokens (default=false).
+  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
   explicit UnicodeCharTokenizer(bool with_offsets = false);

   /// \brief Destructor
@@ -432,13 +463,45 @@ class UnicodeCharTokenizer final : public TensorTransform {
   std::shared_ptr<Data> data_;
 };

+/// \brief Tokenize scalar token or 1-D tokens to 1-D subword tokens.
+class WordpieceTokenizer final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  /// \param[in] vocab A Vocab object.
+  /// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
+  /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
+  /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
+  /// string, else return the string specified (default='[UNK]').
+  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
+  explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
+                              int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
+                              bool with_offsets = false)
+      : WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token),
+                           with_offsets) {}
+
+  explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
+                              int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets);
+
+  /// \brief Destructor
+  ~WordpieceTokenizer() = default;
+
+ protected:
+  /// \brief Function to convert TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+
+ private:
+  struct Data;
+  std::shared_ptr<Data> data_;
+};
+
 #ifndef _WIN32
 /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
 class UnicodeScriptTokenizer final : public TensorTransform {
  public:
   /// \brief Constructor.
-  /// \param[in] keep_whitespace If or not emit whitespace tokens (default=false).
-  /// \param[in] with_offsets If or not output offsets of tokens (default=false).
+  /// \param[in] keep_whitespace Whether or not emit whitespace tokens (default=false).
+  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
   explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false);

   /// \brief Destructor
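
Usage sketch (not part of the patch): constructing the newly added WordpieceTokenizer; the vocabulary-loading call is an assumption and is left commented out.

  std::shared_ptr<Vocab> vocab;
  // Status rc = Vocab::BuildFromFile("vocab.txt", {}, true, &vocab);  // assumed helper, check the Vocab API
  auto wordpiece = std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", /*with_offsets=*/false);
  // "##" is the suffix_indicator described above; tokens missing from the vocab come back as "[UNK]".
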
@@ -458,7 +521,7 @@ class UnicodeScriptTokenizer final : public TensorTransform {
 class WhitespaceTokenizer final : public TensorTransform {
  public:
   /// \brief Constructor.
-  /// \param[in] with_offsets If or not output offsets of tokens (default=false).
+  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
   explicit WhitespaceTokenizer(bool with_offsets = false);

   /// \brief Destructor