Add WhitespaceTokenizer and UnicodeScriptTokenizer for NLP

add CaseFold, NormalizeUTF8

add RegexReplace

add RegexTokenizer

add BasicTokenizer

add WordpieceTokenizer

add BertTokenizer
pull/2092/head
qianlong 5 years ago
parent ea37dc76f0
commit 4f16f036be

File diff suppressed because it is too large Load Diff

@ -0,0 +1,19 @@
# Build and import the ICU4C (International Components for Unicode) third-party
# libraries required by the text kernels (tokenizers, normalizers, regex ops).
set(LIB_ICU_COMMON icuuc)
set(LIB_ICU_DATA icudata)
set(LIB_ICU_I18N icui18n)
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
    # ICU4C is only wired up for Linux-style builds here; the text ops that
    # need it are compiled out elsewhere via the ENABLE_ICU4C define.
    message("icu4c thirdparty do not support windows currently.")
else()
    # Download, configure and build ICU 67.1 from the upstream release tarball.
    # The data filter file trims the bundled ICU data to what is actually used.
    mindspore_add_pkg(icu4c
            VER 67.1
            LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
            URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
            MD5 0c2662a2b0bc80b0eb56495205247c8f
            CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-tests=no --enable-samples=no --enable-icuio=no --enable-extras=no ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
            )
    include_directories(${icu4c_INC})
    # Namespaced aliases so targets can link mindspore::icuuc etc. uniformly.
    add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
    add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
    add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})
    # Enables the ICU-dependent code paths guarded by #ifdef ENABLE_ICU4C.
    add_definitions(-D ENABLE_ICU4C)
endif()

@ -54,6 +54,7 @@ elseif(ENABLE_D OR ENABLE_TESTCASES)
endif()
if (ENABLE_MINDDATA)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/icu4c.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/jpeg_turbo.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/libtiff.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/opencv.cmake)

@ -91,7 +91,20 @@ if (ENABLE_MINDDATA)
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
    # ICU4C is not built on Windows (see cmake/external_libs/icu4c.cmake),
    # so there are no ICU libraries to install there.
    message("icu4c does not support windows system temporarily")
else()
    # Collect every shared-library artifact of the three ICU components
    # (common, data, i18n); the trailing wildcard also matches versioned names.
    file(GLOB_RECURSE ICU4C_LIB_LIST
            ${icu4c_LIBPATH}/libicuuc*
            ${icu4c_LIBPATH}/libicudata*
            ${icu4c_LIBPATH}/libicui18n*
            )
    # Ship the ICU runtime libraries with the mindspore package.
    install(
            FILES ${ICU4C_LIB_LIST}
            DESTINATION ${INSTALL_LIB_DIR}
            COMPONENT mindspore
            )
endif()
endif ()
if (ENABLE_CPU)

@ -108,10 +108,11 @@ target_link_libraries(_c_dataengine PRIVATE mindspore mindspore_gvar)
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module ${PYTHON_LIBRARIES} mindspore::protobuf ${SECUREC_LIBRARY})
else()
set(ICU_LIB mindspore::icuuc mindspore::icudata mindspore::icui18n)
target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module -ldl mindspore::protobuf ${SECUREC_LIBRARY})
endif()
target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs
mindspore::opencv_imgproc mindspore::tinyxml2)
mindspore::opencv_imgproc mindspore::tinyxml2 ${ICU_LIB})
if (ENABLE_GPUQUE)
target_link_libraries(_c_dataengine PRIVATE gpu_queue
${CUDNN_PATH}/lib64/libcudnn.so

@ -65,8 +65,21 @@
#include "dataset/text/kernels/jieba_tokenizer_op.h"
#include "dataset/text/kernels/ngram_op.h"
#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/text/vocab.h"
#include "dataset/text/kernels/lookup_op.h"
#ifdef ENABLE_ICU4C
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/bert_tokenizer_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "dataset/util/random.h"
#include "mindrecord/include/shard_operator.h"
#include "mindrecord/include/shard_pk_sample.h"
@ -485,7 +498,7 @@ void bindTensorOps4(py::module *m) {
py::arg("fillR") = PadOp::kDefFillR, py::arg("fillG") = PadOp::kDefFillG, py::arg("fillB") = PadOp::kDefFillB);
}
void bindTensorOps5(py::module *m) {
void bindTokenizerOps(py::module *m) {
(void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
.def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
py::arg("mode") = JiebaMode::kMix)
@ -503,6 +516,55 @@ void bindTensorOps5(py::module *m) {
const std::string &>(),
py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"), py::arg("r_pad_token"),
py::arg("separator"));
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
*m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
}
// Registers the tokenizer TensorOps that depend on ICU4C with the Python
// module. On builds without ICU (ENABLE_ICU4C undefined, e.g. Windows) this
// function is an intentional no-op so the module still loads.
void bindDependIcuTokenizerOps(py::module *m) {
#ifdef ENABLE_ICU4C
  // Whitespace / script-boundary tokenizers.
  (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
    *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
    .def(py::init<>());
  (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
    *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
    .def(py::init<>())
    .def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
  // String-normalization ops.
  (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
    *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
    .def(py::init<>());
  (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(
    *m, "NormalizeUTF8Op", "Apply normalize operation on utf-8 string tensor.")
    .def(py::init<>())
    .def(py::init<NormalizeForm>(), py::arg("normalize_form") = NormalizeUTF8Op::kDefNormalizeForm);
  // Regex-based replace / tokenize ops.
  (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(
    *m, "RegexReplaceOp", "Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.")
    .def(py::init<const std::string &, const std::string &, bool>(), py::arg("pattern"), py::arg("replace"),
         py::arg("replace_all"));
  (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
    *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
    .def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"), py::arg("keep_delim_pattern"));
  // BERT-style tokenizers built on the ops above.
  (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
    *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
    .def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
  // BertTokenizerOp combines basic + wordpiece tokenization; its defaults come
  // from WordpieceTokenizerOp and BasicTokenizerOp respectively.
  (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp",
                                                                                "Tokenizer used for Bert text process.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
                  NormalizeForm, bool>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
#endif
}
void bindSamplerOps(py::module *m) {
@ -715,6 +777,16 @@ PYBIND11_MODULE(_c_dataengine, m) {
.value("DE_JIEBA_HMM", JiebaMode::kHmm)
.export_values();
#ifdef ENABLE_ICU4C
(void)py::enum_<NormalizeForm>(m, "NormalizeForm", py::arithmetic())
.value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
.value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
.value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
.value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
.value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
.export_values();
#endif
(void)py::enum_<InterpolationMode>(m, "InterpolationMode", py::arithmetic())
.value("DE_INTER_LINEAR", InterpolationMode::kLinear)
.value("DE_INTER_CUBIC", InterpolationMode::kCubic)
@ -734,12 +806,13 @@ PYBIND11_MODULE(_c_dataengine, m) {
bindTensorOps2(&m);
bindTensorOps3(&m);
bindTensorOps4(&m);
bindTensorOps5(&m);
bindTokenizerOps(&m);
bindSamplerOps(&m);
bindDatasetOps(&m);
bindInfoObjects(&m);
bindVocabObjects(&m);
bindGraphData(&m);
bindDependIcuTokenizerOps(&m);
}
} // namespace dataset
} // namespace mindspore

@ -1,8 +1,21 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
# The ICU-backed kernels are only compiled on non-Windows platforms (ICU4C is
# not built on Windows; see cmake/external_libs/icu4c.cmake). On Windows,
# ICU_DEPEND_FILES stays undefined and expands to nothing below.
if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows"))
    set(ICU_DEPEND_FILES
            basic_tokenizer_op.cc
            bert_tokenizer_op.cc
            case_fold_op.cc
            normalize_utf8_op.cc
            regex_replace_op.cc
            regex_tokenizer_op.cc
            unicode_script_tokenizer_op.cc
            whitespace_tokenizer_op.cc)
endif()
add_library(text-kernels OBJECT
        lookup_op.cc
        jieba_tokenizer_op.cc
        unicode_char_tokenizer_op.cc
        ngram_op.cc
        wordpiece_tokenizer_op.cc
        ${ICU_DEPEND_FILES}
        )

@ -0,0 +1,93 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
// Default values for the BasicTokenizerOp constructor arguments.
const bool BasicTokenizerOp::kDefLowerCase = false;
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
// Regex alternation of characters that act as token delimiters: the four ASCII
// punctuation runs, the Unicode punctuation category, and the CJK ideograph
// blocks (so each CJK character becomes its own token).
const char BasicTokenizerOp::kCommonPattern[] =
  "[!-/]"                     // ASCII punctuation ! " # $ % & ' ( ) * + , - . /
  "|[:-@]"                    // ASCII punctuation : ; < = > ? @
  "|[\\[-`]"                  // ASCII punctuation [ \ ] ^ _ `
  "|[{-~]"                    // ASCII punctuation { | } ~
  "|[\\p{P}]"                 // any Unicode punctuation
  "|[\\x{4E00}-\\x{9FFF}]"    // CJK Unified Ideographs
  "|[\\x{3400}-\\x{4DBF}]"    // CJK Unified Ideographs Extension A
  "|[\\x{20000}-\\x{2A6DF}]"  // CJK Unified Ideographs Extension B
  "|[\\x{2A700}-\\x{2B73F}]"  // CJK Unified Ideographs Extension C
  "|[\\x{2B740}-\\x{2B81F}]"  // CJK Unified Ideographs Extension D
  "|[\\x{2B820}-\\x{2CEAF}]"  // CJK Unified Ideographs Extension E
  "|[\\x{F900}-\\x{FAFF}]"    // CJK Compatibility Ideographs
  "|[\\x{2F800}-\\x{2FA1F}]"; // CJK Compatibility Ideographs Supplement
// Special BERT tokens that may be preserved intact. Note the trailing '|':
// this pattern is concatenated in FRONT of other patterns in the constructor,
// so the trailing pipe joins the alternations.
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|";
// Builds the fixed op pipeline used by Compute():
//   case folding, NFD / configurable normalization, accent and control
//   character removal, and the final regex tokenizer whose delimiter patterns
//   are assembled here from kCommonPattern / kUnusedPattern.
//
// @param lower_case            fold case and strip accents instead of applying
//                              normalization_form (see Compute()).
// @param keep_whitespace       emit whitespace runs as tokens.
// @param normalization_form    Unicode normalization used when lower_case is false.
// @param preserve_unused_token keep [CLS]/[SEP]/[UNK]/[PAD]/[MASK] intact.
BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
                                   bool preserve_unused_token)
    : lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalization_form_(normalization_form),  // fix: member was previously left uninitialized
      preserve_unused_token_(preserve_unused_token),
      case_fold_(std::make_unique<CaseFoldOp>()),
      nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
      common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
      replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
      replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
  // Split on whitespace plus the common punctuation/CJK pattern.
  std::string delim_pattern = std::string("\\s+|") + kCommonPattern;
  std::string keep_delim_pattern;
  if (keep_whitespace_) {
    // Keep every delimiter (including whitespace) as a token of its own.
    keep_delim_pattern = delim_pattern;
  } else {
    // Keep punctuation/CJK delimiters but drop the whitespace ones.
    keep_delim_pattern = kCommonPattern;
  }
  if (preserve_unused_token_) {
    // Prepend the special-token alternation (it carries a trailing '|').
    keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
    delim_pattern = kUnusedPattern + delim_pattern;
  }
  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
}
// Tokenizes a scalar UTF-8 string tensor.
//
// Pipeline (ops are built in the constructor):
//   lower_case_ == true : case fold -> NFD normalize -> strip accents (\p{Mn})
//   lower_case_ == false: apply the configured normalization form
// then, in both cases: replace control/format chars (\p{Cc}|\p{Cf}) with a
// space and run the regex tokenizer, whose result becomes *output.
Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  // Only rank-0 (scalar) string tensors are accepted.
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
  }
  // cur_input/processed_tensor alternate as source/destination through the
  // chain of sub-ops; the order of these calls is significant.
  std::shared_ptr<Tensor> cur_input;
  std::shared_ptr<Tensor> processed_tensor;
  if (lower_case_) {
    // to lower case
    RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
    cur_input = processed_tensor;
    // strip accent characters: NFD decomposition exposes combining marks,
    // which replace_accent_chars_ then removes.
    RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor));
    cur_input = processed_tensor;
    RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
  } else {
    RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
  }
  // strip control characters
  cur_input = processed_tensor;
  RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
  return regex_tokenizer_->Compute(processed_tensor, output);
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,64 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp performing BERT-style "basic" tokenization on a scalar UTF-8
// string tensor: optional case folding + accent stripping (or a configurable
// Unicode normalization), control-character cleanup, then regex splitting on
// whitespace/punctuation/CJK characters (see basic_tokenizer_op.cc).
class BasicTokenizerOp : public TensorOp {
 public:
  // Defaults for the constructor arguments (defined in the .cc file).
  static const bool kDefLowerCase;
  static const bool kDefKeepWhitespace;
  static const NormalizeForm kDefNormalizationForm;
  static const bool kDefPreserveUnusedToken;

  // @param lower_case            fold case and strip accents; normalization_form
  //                              is not applied on this path (see Compute()).
  // @param keep_whitespace       emit whitespace runs as tokens.
  // @param normalization_form    normalization used when lower_case is false.
  // @param preserve_unused_token keep [CLS]/[SEP]/[UNK]/[PAD]/[MASK] intact.
  BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
                   NormalizeForm normalization_form = kDefNormalizationForm,
                   bool preserve_unused_token = kDefPreserveUnusedToken);

  ~BasicTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }

  // Tokenizes a scalar string tensor; errors on non-scalar/non-string input.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  static const char kCommonPattern[];  // punctuation + CJK delimiter pattern
  static const char kUnusedPattern[];  // special BERT tokens to preserve
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalization_form_;
  bool preserve_unused_token_;
  // Sub-ops composed by Compute(), constructed once in the constructor.
  std::unique_ptr<CaseFoldOp> case_fold_;
  std::unique_ptr<NormalizeUTF8Op> nfd_normalize_;
  std::unique_ptr<NormalizeUTF8Op> common_normalize_;
  std::unique_ptr<RegexReplaceOp> replace_accent_chars_;
  std::unique_ptr<RegexReplaceOp> replace_control_chars_;
  std::unique_ptr<RegexTokenizerOp> regex_tokenizer_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_

@ -0,0 +1,27 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/bert_tokenizer_op.h"
namespace mindspore {
namespace dataset {
// Runs the two-stage BERT tokenization: basic tokenization first, then
// wordpiece sub-token lookup on the basic tokens.
Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  // Stage 1: normalization + splitting into basic tokens.
  std::shared_ptr<Tensor> basic_tokens;
  RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tokens));
  // Stage 2: wordpiece segmentation of the basic tokens into *output.
  RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tokens, output));
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp implementing full BERT tokenization: a BasicTokenizerOp pass
// followed by a WordpieceTokenizerOp pass (see bert_tokenizer_op.cc).
class BertTokenizerOp : public TensorOp {
 public:
  // The first four parameters are forwarded to the wrapped WordpieceTokenizerOp,
  // the last four to the wrapped BasicTokenizerOp; defaults mirror those ops.
  BertTokenizerOp(const std::shared_ptr<Vocab> &vocab,
                  const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
                  const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
                  const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
                  bool lower_case = BasicTokenizerOp::kDefLowerCase,
                  bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
                  NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
                  bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
      : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
        basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}

  ~BertTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }

  // Applies basic_tokenizer_ then wordpiece_tokenizer_ to `input`.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  WordpieceTokenizerOp wordpiece_tokenizer_;
  BasicTokenizerOp basic_tokenizer_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_

@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/case_fold_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/utypes.h"
namespace mindspore {
namespace dataset {
// Applies ICU NFKC case folding to every element of the input string tensor;
// the output tensor keeps the input's shape.
Status CaseFoldOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  icu::ErrorCode icu_status;
  // Singleton ICU instance; not owned, must not be freed.
  const icu::Normalizer2 *case_folder = icu::Normalizer2::getNFKCCasefoldInstance(icu_status);
  CHECK_FAIL_RETURN_UNEXPECTED(icu_status.isSuccess(), "getNFKCCasefoldInstance failed.");
  std::vector<std::string> folded(input->Size());
  size_t idx = 0;
  for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
    const std::string_view &element = *it;
    // The byte sink writes the folded UTF-8 directly into the result slot.
    icu::StringByteSink<std::string> sink(&folded[idx++]);
    case_folder->normalizeUTF8(0, icu::StringPiece(element.data(), element.size()), sink, nullptr, icu_status);
    CHECK_FAIL_RETURN_UNEXPECTED(icu_status.isSuccess(), "normalizeUTF8 failed.");
  }
  *output = std::make_shared<Tensor>(std::move(folded), input->shape());
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,39 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
#define DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp that applies ICU NFKC case folding to every element of a UTF-8
// string tensor (see case_fold_op.cc).
class CaseFoldOp : public TensorOp {
 public:
  CaseFoldOp() {}
  ~CaseFoldOp() override = default;
  void Print(std::ostream &out) const override { out << "CaseFoldOp"; }
  // Case-folds each string element; writes a same-shape tensor to *output.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_

@ -29,6 +29,7 @@ JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::strin
}
Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {

@ -24,6 +24,7 @@ LookupOp::LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id)
: vocab_(vocab), default_id_(default_id), type_(DataType("int32")) {}
Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
RETURN_UNEXPECTED_IF_NULL(vocab_);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor");
std::vector<WordIdType> word_ids;

@ -34,6 +34,7 @@ NgramOp::NgramOp(const std::vector<int32_t> &ngrams, int32_t l_len, int32_t r_le
separator_(separator) {}
Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING && input->Rank() == 1, "Not a 1-D str Tensor");
std::vector<int32_t> offsets; // offsets for each str
std::vector<std::string> res; // holds the result of ngrams

@ -0,0 +1,75 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/normalize_utf8_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/utypes.h"
namespace mindspore {
namespace dataset {
// Default normalization form used when none is given at construction.
const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;

// Normalizes every element of a UTF-8 string tensor with the configured
// Unicode normalization form; the result keeps the input's shape.
Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  // kNone is the identity transform: pass the input tensor through untouched.
  if (normalize_form_ == NormalizeForm::kNone) {
    *output = input;
    return Status::OK();
  }
  icu::ErrorCode error;
  // Select the matching ICU singleton normalizer (not owned, must not be freed).
  const icu::Normalizer2 *normalizer = nullptr;
  if (normalize_form_ == NormalizeForm::kNfc) {
    normalizer = icu::Normalizer2::getNFCInstance(error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFCInstance failed");
  } else if (normalize_form_ == NormalizeForm::kNfkc) {
    normalizer = icu::Normalizer2::getNFKCInstance(error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCInstance failed");
  } else if (normalize_form_ == NormalizeForm::kNfd) {
    normalizer = icu::Normalizer2::getNFDInstance(error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFDInstance failed");
  } else if (normalize_form_ == NormalizeForm::kNfkd) {
    normalizer = icu::Normalizer2::getNFKDInstance(error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKDInstance failed");
  } else {
    RETURN_STATUS_UNEXPECTED("unexpected normalize form");
  }
  std::vector<std::string> normalized(input->Size());
  size_t out_idx = 0;
  for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
    // The byte sink writes the normalized UTF-8 directly into the result slot.
    icu::StringByteSink<std::string> sink(&normalized[out_idx++]);
    normalizer->normalizeUTF8(0, icu::StringPiece((*it).data(), (*it).size()), sink, nullptr, error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
  }
  *output = std::make_shared<Tensor>(std::move(normalized), input->shape());
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,50 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
#define DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// Unicode normalization forms supported by NormalizeUTF8Op. Each value maps
// to the corresponding ICU Normalizer2 instance in normalize_utf8_op.cc;
// kNone means "no normalization" (the input passes through unchanged).
enum class NormalizeForm {
  kNone = 0,
  kNfc,
  kNfkc,
  kNfd,
  kNfkd,
};
// TensorOp that applies a Unicode normalization form to every element of a
// UTF-8 string tensor (see normalize_utf8_op.cc).
class NormalizeUTF8Op : public TensorOp {
 public:
  // Default form (defined in the .cc file).
  static const NormalizeForm kDefNormalizeForm;
  explicit NormalizeUTF8Op(NormalizeForm normalize_form = kDefNormalizeForm) : normalize_form_(normalize_form) {}
  ~NormalizeUTF8Op() override = default;
  void Print(std::ostream &out) const override { out << "NormalizeUTF8Op"; }
  // Normalizes each string element; writes a same-shape tensor to *output.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  NormalizeForm normalize_form_;  // which normalization to apply in Compute()
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_

@ -0,0 +1,57 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/regex_replace_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
// Applies the regex replacement to a single UTF-8 string.
//
// @param matcher pre-built ICU matcher for pattern_, reset here onto `text`.
// @param text    UTF-8 input string (viewed, not owned).
// @param out     receives the UTF-8 result; must be non-null.
// @return error status if matcher/out is null or ICU reports a failure.
Status RegexReplaceOp::RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text,
                                    std::string *out) const {
  CHECK_FAIL_RETURN_UNEXPECTED((matcher != nullptr && out != nullptr), "Input is null");
  UErrorCode icu_error = U_ZERO_ERROR;
  // ICU regex operates on UTF-16 UnicodeString, so convert first.
  icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text);
  matcher->reset(unicode_text);
  icu::UnicodeString unicode_out;
  if (replace_all_) {
    unicode_out = matcher->replaceAll(replace_, icu_error);
  } else {
    unicode_out = matcher->replaceFirst(replace_, icu_error);
  }
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "RegexReplace failed");
  // Convert the UTF-16 result back to UTF-8 into *out.
  unicode_out.toUTF8String(*out);
  return Status::OK();
}
// Applies the regex replacement to every element of the input string tensor.
//
// @param input  string tensor; each element is processed independently.
// @param output receives a new string tensor with the same shape.
// @return error status if the pattern fails to compile or a replacement fails.
Status RegexReplaceOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  UErrorCode icu_error = U_ZERO_ERROR;
  // Build the matcher once from pattern_ and reuse it (reset) per element.
  icu::RegexMatcher matcher(pattern_, 0, icu_error);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "Create icu RegexMatcher failed, you may input one error pattern");
  std::vector<std::string> strs(input->Size());
  int i = 0;
  for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
    // Bug fix: advance the output index per element. Previously `strs[i]` was
    // used without incrementing i, so every result overwrote strs[0] and all
    // other output slots stayed empty.
    RETURN_IF_NOT_OK(RegexReplace(&matcher, *iter, &strs[i++]));
  }
  *output = std::make_shared<Tensor>(std::move(strs), input->shape());
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,55 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
#define DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
#include <memory>
#include <string>
#include "unicode/regex.h"
#include "unicode/errorcode.h"
#include "unicode/utypes.h"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp that performs ICU regular-expression replacement on every element
// of a UTF-8 string tensor (see regex_replace_op.cc).
class RegexReplaceOp : public TensorOp {
 public:
  // @param pattern     ICU regex pattern to search for (stored as UTF-16).
  // @param replace     replacement text.
  // @param replace_all true: replace every match; false: only the first one.
  RegexReplaceOp(const std::string &pattern, const std::string &replace, bool replace_all = true)
      : pattern_(icu::UnicodeString::fromUTF8(pattern)),
        replace_(icu::UnicodeString::fromUTF8(replace)),
        replace_all_(replace_all) {}

  ~RegexReplaceOp() override = default;

  void Print(std::ostream &out) const override { out << "RegexReplaceOp"; }

  // Replaces matches in each string element; writes a same-shape tensor to *output.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 protected:
  // Replaces matches in one string; `matcher` is reset onto `text` here.
  Status RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, std::string *out) const;

 private:
  const icu::UnicodeString pattern_;
  const icu::UnicodeString replace_;
  const bool replace_all_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_

@ -0,0 +1,103 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
// Extracts the code-unit range [start, start + len) from `input` and writes it
// to the UTF-8 and/or UnicodeString destinations (either may be null, not both).
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
                                          icu::UnicodeString *out_unicode) const {
  // At least one destination must be supplied.
  CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
  // The requested range must be non-empty and lie inside the string.
  const int total_len = input.length();
  const int end = start + len;
  CHECK_FAIL_RETURN_UNEXPECTED((start >= 0 && len > 0 && end <= total_len), "Out of range");
  icu::UnicodeString substring;
  input.extract(start, len, substring);
  if (out_utf8 != nullptr) {
    // Convert the extracted UTF-16 run to UTF-8 into *out_utf8.
    substring.toUTF8String(*out_utf8);
  }
  if (out_unicode != nullptr) {
    *out_unicode = substring;
  }
  return Status::OK();
}
// Splits `text` into tokens, using delim_pattern_ as the separator regex.
// When keep_delim_ is true, a matched delimiter is itself emitted as a token
// if it fully matches keep_delim_pattern_.
// @param text - UTF-8 input string.
// @param out_tokens - receives the resulting UTF-8 tokens (cleared first).
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
  UErrorCode status = U_ZERO_ERROR;
  out_tokens->clear();
  icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");
  icu::RegexMatcher delim_matcher(keep_delim_pattern_, 0, status);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");
  icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
  token_matcher.reset(utext);
  int token_start_index = 0;
  status = U_ZERO_ERROR;
  while (token_matcher.find(status) && U_SUCCESS(status)) {
    int deli_start_index = token_matcher.start(status);
    CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed");
    int deli_end_index = token_matcher.end(status);
    // Fixed copy-paste: this failure is about the matched END index, not the start index.
    CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched end index failed");
    // Emit the (non-empty) text between the previous delimiter and this one.
    int token_len = deli_start_index - token_start_index;
    if (token_len > 0) {
      std::string token;
      RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
      out_tokens->emplace_back(std::move(token));
    }
    int delim_len = deli_end_index - deli_start_index;
    // Optionally keep the delimiter itself, but only if it fully matches keep_delim_pattern_.
    if (keep_delim_ && delim_len > 0) {
      icu::UnicodeString delim_str;
      std::string delim_utf8_str;
      RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
      delim_matcher.reset(delim_str);
      if (delim_matcher.matches(status) && U_SUCCESS(status)) {
        out_tokens->emplace_back(std::move(delim_utf8_str));
      }
    }
    token_start_index = deli_end_index;
  }
  // Flush any trailing text after the last delimiter.
  if (token_start_index < utext.length()) {
    std::string temp;
    RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
    out_tokens->emplace_back(std::move(temp));
  }
  return Status::OK();
}
// Tokenizes a scalar string tensor into a 1-D string tensor of tokens.
// @param input - scalar tensor of type DE_STRING.
// @param output - receives a 1-D DE_STRING tensor, one element per token.
Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
  }
  std::string_view text;
  RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
  std::vector<std::string> tokens;
  RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
  // Capture the shape before handing `tokens` to the Tensor constructor so the
  // result does not depend on argument-evaluation order relative to std::move.
  TensorShape out_shape({static_cast<dsize_t>(tokens.size())});
  *output = std::make_shared<Tensor>(std::move(tokens), out_shape);
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,58 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_REGEX_TOKENIZER_OP_H_
#define DATASET_TEXT_REGEX_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include <vector>
#include "unicode/regex.h"
#include "unicode/errorcode.h"
#include "unicode/utypes.h"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp that splits a scalar string tensor into tokens using an ICU regex
// as the delimiter, optionally keeping the delimiters themselves as tokens.
class RegexTokenizerOp : public TensorOp {
 public:
  // Constructor.
  // @param delim_pattern - UTF-8 regex that matches token delimiters.
  // @param keep_delim_pattern - UTF-8 regex; a matched delimiter is kept as a
  //        token only if it matches this pattern. An empty string disables
  //        keeping delimiters entirely (see keep_delim_ below).
  RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
    : delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
      keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
      keep_delim_(!keep_delim_pattern.empty()) {}
  ~RegexTokenizerOp() override = default;
  void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
  // Splits the input scalar string tensor into a 1-D tensor of tokens.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
 protected:
  // Extracts code units [start, start + len) of `input` into *out_utf8 (UTF-8)
  // and/or *out_unicode; at least one output pointer must be non-null.
  Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
                          icu::UnicodeString *out_unicode = nullptr) const;
  // Performs the actual regex-based split of `text` into *out_tokens.
  Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
 private:
  const icu::UnicodeString delim_pattern_;       // delimiter regex (UTF-16)
  const icu::UnicodeString keep_delim_pattern_;  // filter for delimiters to keep (UTF-16)
  const bool keep_delim_;                        // true iff keep_delim_pattern was non-empty
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_REGEX_TOKENIZER_OP_H_

@ -28,6 +28,7 @@ namespace mindspore {
namespace dataset {
Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#define DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#ifndef DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
@ -37,4 +37,4 @@ class UnicodeCharTokenizerOp : public TensorOp {
} // namespace dataset
} // namespace mindspore
#endif // DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#endif // DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_

@ -0,0 +1,93 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "cppjieba/Unicode.hpp"
#include "unicode/errorcode.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
using cppjieba::DecodeRunesInString;
using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
// Default: whitespace-only segments are dropped from the tokenizer output.
const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
// Splits a scalar string tensor into tokens at Unicode-script boundaries and
// at whitespace/non-whitespace boundaries, producing a 1-D string tensor.
// @param input - scalar tensor of type DE_STRING.
// @param output - receives a 1-D DE_STRING tensor of the resulting segments.
Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
  }
  std::string_view str;
  RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
  RuneStrArray runes;
  if (!DecodeRunesInString(str.data(), str.size(), runes)) {
    RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
  }
  UScriptCode last_script = USCRIPT_INVALID_CODE;
  icu::ErrorCode status;
  int start = 0;  // byte offset of the current segment in `str`
  int len = 0;    // byte length of the current segment
  std::vector<std::string> splits;
  bool was_space = false;
  for (size_t i = 0; i < runes.size(); i++) {
    bool is_space = u_isUWhiteSpace(runes[i].rune);
    UScriptCode script = uscript_getScript(runes[i].rune, status);
    if (status.isFailure()) {
      status.reset();
      script = USCRIPT_INVALID_CODE;
    }
    // 1) Separate UTF-8 strings of different UScriptCode values
    //    (such as: "Chinese中国" should be split to ["Chinese", "中国"])
    // 2) Separate whitespace and non-whitespace UTF-8 strings
    //    (such as: " ." should be split to [" ", "."])
    if (len > 0 && (script != last_script || is_space != was_space)) {
      // 3) If keep_whitespace_ is false, all the whitespace characters will be discarded
      if (keep_whitespace_ || !was_space) {
        std::string temp(str.substr(start, len));
        splits.emplace_back(std::move(temp));
      }
      start = runes[i].offset;
      len = runes[i].len;
    } else {
      len += runes[i].len;
    }
    last_script = script;
    was_space = is_space;
  }
  // Flush the trailing segment, subject to the same whitespace filter.
  if (len > 0 && (keep_whitespace_ || !was_space)) {
    std::string temp(str.substr(start, len));
    splits.emplace_back(std::move(temp));
  }
  // 4) If the input is empty scalar string, the output will be 1-D empty string.
  if (splits.empty()) {
    splits.emplace_back("");
  }
  // Capture the shape first, then move `splits` into the Tensor (the original
  // copied the vector; moving matches RegexTokenizerOp::Compute and avoids it).
  TensorShape out_shape({static_cast<dsize_t>(splits.size())});
  *output = std::make_shared<Tensor>(std::move(splits), out_shape);
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save