add sentence piece

pull/3116/head
xulei2020 5 years ago
parent 8e4c0a9d93
commit 18b519ae0f

@@ -0,0 +1,25 @@
if (WIN32)
set(sentencepiece_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 -Wno-unused-result -Wno-stringop-overflow -Wno-format-extra-args -Wno-format")
set(sentencepiece_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
mindspore_add_pkg(sentencepiece
VER 0.1.92
LIBS sentencepiece sentencepiece_train
URL https://github.com/google/sentencepiece/archive/v0.1.92.tar.gz
CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DSPM_USE_BUILTIN_PROTOBUF=ON
MD5 5dfd2241914b5598a68b2a8542ed8e91
)
else ()
set(sentencepiece_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 -Wno-unused-result -Wno-sign-compare")
set(sentencepiece_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
mindspore_add_pkg(sentencepiece
VER 0.1.92
LIBS sentencepiece sentencepiece_train
URL https://github.com/google/sentencepiece/archive/v0.1.92.tar.gz
CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DSPM_USE_BUILTIN_PROTOBUF=OFF -DSPM_ENABLE_SHARED=OFF -DPROTOBUF_INC=${protobuf_INC}
MD5 5dfd2241914b5598a68b2a8542ed8e91
PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/sentencepiece/sentencepiece.patch001
)
endif ()
include_directories(${sentencepiece_INC})
add_library(mindspore::sentencepiece ALIAS sentencepiece::sentencepiece)
add_library(mindspore::sentencepiece_train ALIAS sentencepiece::sentencepiece_train)
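Once this package block is in place, the new native symbols ride along in the _c_dataengine pybind module built later in this commit. A minimal post-build smoke test (a sketch; it only assumes the wheel was built and installed successfully):

# Quick check that the SentencePiece bindings made it into the module.
# The names below are the ones bound further down in this same commit.
from mindspore._c_dataengine import (
    SentencePieceVocab,        # bound in bindVocabObjects
    SentencePieceTokenizerOp,  # bound in bindTokenizerOps
    SentencePieceModel,        # enum with DE_SENTENCE_PIECE_UNIGRAM, ...
)

print(SentencePieceModel.DE_SENTENCE_PIECE_UNIGRAM)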

@@ -75,6 +75,7 @@ if (ENABLE_MINDDATA)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/sqlite.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/tinyxml2.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/cppjieba.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/sentencepiece.cmake)
endif()
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/gtest.cmake)

@@ -40,6 +40,7 @@ if (CMAKE_SYSTEM_NAME MATCHES "Windows")
set(jpeg_turbo_LIBPATH ${jpeg_turbo_LIBPATH}/../bin/)
set(sqlite_LIBPATH ${sqlite_LIBPATH}/../bin/)
set(tinyxml2_LIBPATH ${tinyxml2_LIBPATH}/../bin/)
set(sentencepiece_LIBPATH ${sentencepiece_LIBPATH}/../bin/)
else ()
set(INSTALL_LIB_DIR "lib")
endif ()
@@ -91,6 +92,14 @@ if (ENABLE_MINDDATA)
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
file(GLOB_RECURSE SENTENCEPIECE_LIB_LIST
${sentencepiece_LIBPATH}/libsentencepiece*
)
install(
FILES ${SENTENCEPIECE_LIB_LIST}
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
message("icu4c does not support windows system temporarily")
else()

@@ -128,7 +128,7 @@ else()
target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module -ldl mindspore::protobuf ${SECUREC_LIBRARY})
endif()
target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs
mindspore::opencv_imgproc mindspore::tinyxml2 ${ICU_LIB})
mindspore::opencv_imgproc mindspore::tinyxml2 mindspore::sentencepiece mindspore::sentencepiece_train ${ICU_LIB})
if (ENABLE_GPUQUE)
target_link_libraries(_c_dataengine PRIVATE gpu_queue
${CUDNN_PATH}/lib64/libcudnn.so

@@ -87,7 +87,8 @@ static std::unordered_map<uint32_t, pFunction> g_parse_op_func_ = {
{kTextFile, &DEPipeline::ParseTextFileOp},
{kBuildVocab, &DEPipeline::ParseBuildVocabOp},
{kClue, &DEPipeline::ParseClueOp},
{kEpochCtrl, &DEPipeline::ParseEpochCtrlOp}};
{kEpochCtrl, &DEPipeline::ParseEpochCtrlOp},
{kSentencePieceVocab, &DEPipeline::ParseBuildSentencePieceVocabOp}};
DEPipeline::DEPipeline() : iterator_(nullptr) {
try {
@@ -1710,6 +1711,41 @@ Status DEPipeline::ParseBuildVocabOp(const py::dict &args, std::shared_ptr<Datas
return Status::OK();
}
Status DEPipeline::ParseBuildSentencePieceVocabOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::shared_ptr<BuildSentencePieceVocabOp::Builder> builder = std::make_shared<BuildSentencePieceVocabOp::Builder>();
for (auto arg : args) {
std::string key = py::str(arg.first);
py::handle value = arg.second;
if (!value.is_none()) {
if (key == "vocab_size") {
builder->SetVocabSize(ToInt(value));
} else if (key == "character_coverage") {
(void)builder->SetCharacterCoverage(ToFloat(value));
} else if (key == "params") {
std::unordered_map<std::string, std::string> params;
for (auto param : py::reinterpret_borrow<py::dict>(value)) {
std::string param_key = py::reinterpret_borrow<py::str>(param.first);
if (param_key == "input" || param_key == "vocab_size" || param_key == "model_prefix" ||
param_key == "character_coverage" || param_key == "model_type") {
continue;
}
params[param_key] = py::reinterpret_borrow<py::str>(param.second);
}
(void)builder->SetParams(params);
} else if (key == "vocab") {
(void)builder->SetVocab(value.cast<std::shared_ptr<SentencePieceVocab>>());
} else if (key == "model_type") {
(void)builder->SetModelType(value.cast<SentencePieceModel>());
}
}
}
std::shared_ptr<BuildSentencePieceVocabOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*top = op;
return Status::OK();
}
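For reference, here is a sketch of the args dict this parser consumes. It is assembled by BuildSentencePieceVocabDataset.get_args() later in this commit; the concrete values below are hypothetical:

# Hypothetical payload handed to ParseBuildSentencePieceVocabOp. The branches
# above consume vocab, vocab_size, character_coverage, model_type and params;
# col_names is also present in the dict, though the parser does not read it.
args = {
    "vocab": vocab,  # SentencePieceVocab handle created on the Python side
    "col_names": ["text"],
    "vocab_size": 5000,
    "character_coverage": 0.9995,
    "model_type": SentencePieceModel.DE_SENTENCE_PIECE_UNIGRAM,
    "params": {},    # extra trainer options; reserved keys are filtered out
}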
Status DEPipeline::ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::vector<std::string> files_list;

@@ -71,7 +71,8 @@ enum OpName {
kTextFile,
kBuildVocab,
kClue,
kEpochCtrl
kEpochCtrl,
kSentencePieceVocab,
};
// The C++ binder class that we expose to the python script.
@@ -195,6 +196,8 @@ class DEPipeline {
Status ParseBuildVocabOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status StopSend();
Status ParseBuildSentencePieceVocabOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom);
Status ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);

@@ -88,7 +88,9 @@
#include "minddata/dataset/text/kernels/to_number_op.h"
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/util/random.h"
#include "minddata/mindrecord/include/shard_distributed_sample.h"
#include "minddata/mindrecord/include/shard_operator.h"
@@ -684,6 +686,15 @@ void bindTokenizerOps(py::module *m) {
(void)py::class_<SlidingWindowOp, TensorOp, std::shared_ptr<SlidingWindowOp>>(
*m, "SlidingWindowOp", "TensorOp to apply sliding window to a 1-D Tensor.")
.def(py::init<uint32_t, int32_t>(), py::arg("width"), py::arg("axis"));
(void)py::class_<SentencePieceTokenizerOp, TensorOp, std::shared_ptr<SentencePieceTokenizerOp>>(
*m, "SentencePieceTokenizerOp", "Tokenize scalar token or 1-D tokens to tokens by sentence piece.")
.def(py::init<std::shared_ptr<SentencePieceVocab> &, const SPieceTokenizerLoadType, const SPieceTokenizerOutType>(),
py::arg("vocab"), py::arg("load_type") = SPieceTokenizerLoadType::kModel,
py::arg("out_type") = SPieceTokenizerOutType::kString)
.def(
py::init<const std::string &, const std::string &, const SPieceTokenizerLoadType, const SPieceTokenizerOutType>(),
py::arg("model_path"), py::arg("model_filename"), py::arg("load_type") = SPieceTokenizerLoadType::kFile,
py::arg("out_type") = SPieceTokenizerOutType::kString);
}
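The two constructors bound above can be exercised directly from Python. A rough sketch, assuming vocab is an already trained SentencePieceVocab and the model path is a placeholder (the high-level mindspore.dataset.text.SentencePieceTokenizer wrapper lives in a file not shown in this diff):

import mindspore._c_dataengine as cde

# From an in-memory vocab (the model proto held by the vocab object).
op = cde.SentencePieceTokenizerOp(vocab,
                                  cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KMODEL,
                                  cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KString)

# Or from a model file on disk, emitting piece ids instead of strings.
op = cde.SentencePieceTokenizerOp("/path/to/model/dir", "m.model",
                                  cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KFILE,
                                  cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KINT)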
void bindDependIcuTokenizerOps(py::module *m) {
@@ -839,6 +850,33 @@ void bindVocabObjects(py::module *m) {
THROW_IF_ERROR(Vocab::BuildFromPyDict(words, &v));
return v;
});
(void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab")
.def(py::init<>())
.def_static("from_file",
[](const py::list &paths, const int vocab_size, const float character_coverage,
const SentencePieceModel model_type, const py::dict &params) {
std::shared_ptr<SentencePieceVocab> v;
std::vector<std::string> path_list;
for (auto path : paths) {
path_list.emplace_back(py::str(path));
}
std::unordered_map<std::string, std::string> param_map;
for (auto param : params) {
std::string key = py::reinterpret_borrow<py::str>(param.first);
if (key == "input" || key == "vocab_size" || key == "model_prefix" || key == "character_coverage" ||
key == "model_type") {
continue;
}
param_map[key] = py::reinterpret_borrow<py::str>(param.second);
}
THROW_IF_ERROR(SentencePieceVocab::BuildFromFile(path_list, vocab_size, character_coverage,
model_type, param_map, &v));
return v;
})
.def_static("save_model",
[](const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename) {
THROW_IF_ERROR(SentencePieceVocab::SaveModel(vocab, path, filename));
});
}
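A hedged end-to-end sketch of the two statics bound above (file and directory names are placeholders):

import mindspore._c_dataengine as cde

# Train a vocab straight from raw text files. Extra trainer options go in the
# trailing dict; reserved keys (input, vocab_size, model_prefix,
# character_coverage, model_type) are silently dropped by the filter above.
vocab = cde.SentencePieceVocab.from_file(
    ["corpus.txt"], 5000, 0.9995,
    cde.SentencePieceModel.DE_SENTENCE_PIECE_UNIGRAM, {})

# Persist the trained model proto to <dir>/<filename>.
cde.SentencePieceVocab.save_model(vocab, "/tmp", "m.model")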
void bindGraphData(py::module *m) {
@@ -998,6 +1036,7 @@ PYBIND11_MODULE(_c_dataengine, m) {
.value("CIFAR100", OpName::kCifar100)
.value("RANDOMDATA", OpName::kRandomData)
.value("BUILDVOCAB", OpName::kBuildVocab)
.value("SENTENCEPIECEVOCAB", OpName::kSentencePieceVocab)
.value("CELEBA", OpName::kCelebA)
.value("TEXTFILE", OpName::kTextFile)
.value("CLUE", OpName::kClue)
@@ -1032,6 +1071,24 @@ PYBIND11_MODULE(_c_dataengine, m) {
.value("DE_BORDER_REFLECT", BorderType::kReflect)
.value("DE_BORDER_SYMMETRIC", BorderType::kSymmetric)
.export_values();
(void)py::enum_<SentencePieceModel>(m, "SentencePieceModel", py::arithmetic())
.value("DE_SENTENCE_PIECE_UNIGRAM", SentencePieceModel::kUnigram)
.value("DE_SENTENCE_PIECE_BPE", SentencePieceModel::kBpe)
.value("DE_SENTENCE_PIECE_CHAR", SentencePieceModel::kChar)
.value("DE_SENTENCE_PIECE_WORD", SentencePieceModel::kWord)
.export_values();
(void)py::enum_<SPieceTokenizerOutType>(m, "SPieceTokenizerOutType", py::arithmetic())
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KString", SPieceTokenizerOutType::kString)
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt)
.export_values();
(void)py::enum_<SPieceTokenizerLoadType>(m, "SPieceTokenizerLoadType", py::arithmetic())
.value("DE_SPIECE_TOKENIZER_LOAD_KFILE", SPieceTokenizerLoadType::kFile)
.value("DE_SPIECE_TOKENIZER_LOAD_KMODEL", SPieceTokenizerLoadType::kModel)
.export_values();
bindDEPipeline(&m);
bindTensor(&m);
bindTensorOps1(&m);

@@ -33,6 +33,7 @@
#include "minddata/dataset/engine/datasetops/filter_op.h"
#include "minddata/dataset/engine/datasetops/source/generator_op.h"
#include "minddata/dataset/engine/datasetops/build_vocab_op.h"
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#endif
#include "minddata/dataset/engine/datasetops/batch_op.h"

@@ -32,6 +32,7 @@ if (ENABLE_PYTHON)
barrier_op.cc
filter_op.cc
build_vocab_op.cc
build_sentence_piece_vocab_op.cc
)
endif()

@@ -0,0 +1,193 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#include <iomanip>
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/engine/opt/pass.h"
namespace mindspore {
namespace dataset {
BuildSentencePieceVocabOp::BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab,
std::vector<std::string> col_names, uint32_t vocab_size,
float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
int32_t op_conn_size)
: PipelineOp(op_conn_size),
vocab_size_(vocab_size),
vocab_(vocab),
col_names_(col_names),
character_coverage_(character_coverage),
model_type_(model_type),
params_(params),
col_id_(0) {
sentence_queue_ = std::make_unique<Queue<TensorRow>>(op_conn_size);
read_done_ = false;
ret_status_ = Status::OK();
}
Status BuildSentencePieceVocabOp::operator()() {
RETURN_UNEXPECTED_IF_NULL(tree_);
RETURN_IF_NOT_OK(sentence_queue_->Register(tree_->AllTasks()));
RETURN_IF_NOT_OK(
tree_->AllTasks()->CreateAsyncTask("sentenceTask", std::bind(&BuildSentencePieceVocabOp::SentenceThread, this)));
TaskManager::FindMe()->Post();
child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
TensorRow new_row;
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
bool eoe_warning = false;  // error out if more than one EOE is received (repeat detected upstream)
while (child_iterator_->eof_handled() == false) {
while (new_row.empty() == false) {
RETURN_IF_NOT_OK(sentence_queue_->EmplaceBack(new_row));
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
}
CHECK_FAIL_RETURN_UNEXPECTED(!eoe_warning, "no op should be after from_dataset (repeat detected)");
eoe_warning = true;
}
// push an empty TensorRow to tell the trainer thread to quit
TensorRow empty_row = {};
RETURN_IF_NOT_OK(sentence_queue_->EmplaceBack(empty_row));
return Status::OK();
}
Status BuildSentencePieceVocabOp::SentenceThread() {
TaskManager::FindMe()->Post();
if (col_names_.empty() == true) {
auto itr = column_name_id_map_.find("text");
CHECK_FAIL_RETURN_UNEXPECTED(itr != column_name_id_map_.end(),
"'text' column doesn't exist when column name is empty");
col_id_ = itr->second;
} else {
auto itr = column_name_id_map_.find(col_names_[0]);
CHECK_FAIL_RETURN_UNEXPECTED(itr != column_name_id_map_.end(), col_names_[0] + " column doesn't exist");
col_id_ = itr->second;
}
std::unique_ptr<DatasetSentenceIterator> sentence_iter = std::make_unique<DatasetSentenceIterator>(this);
std::string model_proto;
sentencepiece::util::Status s_status =
sentencepiece::SentencePieceTrainer::Train(BuildParams(), sentence_iter.get(), &model_proto);
if (!s_status.ok()) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, s_status.message());
} else {
if (vocab_ == nullptr) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "sentencepiece vocab ptr must not be nullptr");
}
vocab_->set_model_proto(model_proto);
}
RETURN_IF_NOT_OK(out_connector_->Add(0, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)));
RETURN_IF_NOT_OK(out_connector_->Add(0, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)));
return Status::OK();
}
std::unordered_map<std::string, std::string> BuildSentencePieceVocabOp::BuildParams() {
std::unordered_map<std::string, std::string> ret_params;
ret_params["vocab_size"] = std::to_string(vocab_size_);
ret_params["character_coverage"] = std::to_string(character_coverage_);
if (model_type_ == SentencePieceModel::kBpe) {
ret_params["model_type"] = "BPE";
} else if (model_type_ == SentencePieceModel::kChar) {
ret_params["model_type"] = "CHAR";
} else if (model_type_ == SentencePieceModel::kWord) {
ret_params["model_type"] = "WORD";
} else {
ret_params["model_type"] = "UNIGRAM";
}
// filter out params that are already set by explicit constructor arguments;
// model_prefix in particular must stay empty so the model proto is returned in memory
for (auto param : params_) {
std::string key = param.first;
if (key == "input" || key == "vocab_size" || key == "model_prefix" || key == "character_coverage" ||
key == "model_type") {
continue;
}
ret_params[key] = param.second;
}
ret_params["model_prefix"] = "";
ret_params["minloglevel"] = "1";
return ret_params;
}
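Concretely, for a kBpe build with one user-supplied pass-through option, BuildParams() would produce something like the following (a sketch shown as a Python dict for readability; the actual values depend on the constructor arguments):

# Shape of the map handed to SentencePieceTrainer::Train(); inputs hypothetical.
{
    "vocab_size": "5000",
    "character_coverage": "0.999500",   # std::to_string on a float
    "model_type": "BPE",
    "max_sentence_length": "8192",      # example pass-through from params_
    "model_prefix": "",                 # forced empty: proto is returned in memory
    "minloglevel": "1",                 # quiets trainer logging
}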
bool BuildSentencePieceVocabOp::Done() { return read_done_; }
void BuildSentencePieceVocabOp::Next(std::string *sentence) {
TensorRow new_row;
Status s = sentence_queue_->PopFront(&new_row);
if (s.IsError()) {
read_done_ = true;
ret_status_ = s;
return;
}
if (new_row.empty() == true) {
read_done_ = true;
ret_status_ = Status::OK();
return;
}
if (new_row[col_id_]->type().IsNumeric() || new_row[col_id_]->Rank() > 1) {
ret_status_ = Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
"for dataset only words on string columns or must bu scalar");
read_done_ = true;
return;
}
std::string_view sentence_v;
ret_status_ = new_row[col_id_]->GetItemAt(&sentence_v, {});
if (ret_status_.IsError()) {
read_done_ = true;
return;
}
std::string st{sentence_v};
*sentence = st;
ret_status_ = Status::OK();
}
// Pre-Visitor accept method for NodePass
Status BuildSentencePieceVocabOp::PreAccept(NodePass *p, bool *modified) {
// Downcast shared pointer then call the pre-visitation
return p->PreRunOnNode(shared_from_base<BuildSentencePieceVocabOp>(), modified);
}
Status BuildSentencePieceVocabOp::Builder::Build(std::shared_ptr<BuildSentencePieceVocabOp> *op) {
(*op) = std::make_shared<BuildSentencePieceVocabOp>(builder_vocab_, builder_col_names_, builder_vocab_size_,
builder_character_coverage_, builder_model_type_, builder_params_,
builder_connector_size_);
return Status::OK();
}
BuildSentencePieceVocabOp::Builder::Builder() {
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
builder_connector_size_ = cfg->op_connector_size();
}
BuildSentencePieceVocabOp::DatasetSentenceIterator::DatasetSentenceIterator(BuildSentencePieceVocabOp *s_p_vocab_ptr)
: s_p_vocab_ptr_(s_p_vocab_ptr) {}
bool BuildSentencePieceVocabOp::DatasetSentenceIterator::done() const {
if (s_p_vocab_ptr_ == nullptr) {
return true;
}
return s_p_vocab_ptr_->Done();
}
void BuildSentencePieceVocabOp::DatasetSentenceIterator::Next() {
if (s_p_vocab_ptr_ == nullptr) {
return;
}
s_p_vocab_ptr_->Next(&value_);
}
} // namespace dataset
} // namespace mindspore
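The DatasetSentenceIterator above mirrors a pattern the standalone SentencePiece Python package also exposes: training from a sentence iterator with the model proto captured by an in-memory writer instead of a file. A rough Python analogue of what SentenceThread() does, assuming the sentencepiece pip package (corpus and sizes are toy placeholders):

import io
import sentencepiece as spm

sentences = iter([
    "hello world",
    "sentencepiece can train from a python iterator",
    "the model proto is written to an in memory buffer",
])
model = io.BytesIO()
# Counterpart of SentencePieceTrainer::Train(BuildParams(), sentence_iter, &model_proto):
spm.SentencePieceTrainer.train(sentence_iterator=sentences, model_writer=model,
                               vocab_size=30, model_type="unigram", minloglevel=1)
# model.getvalue() corresponds to the model_proto stored via set_model_proto().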

@@ -0,0 +1,186 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_ENGINE_DATASETOPS_BUILD_SENTENCE_VOCAB_OP_H_
#define DATASET_ENGINE_DATASETOPS_BUILD_SENTENCE_VOCAB_OP_H_
#include <sentencepiece_trainer.h>
#include <sentencepiece_processor.h>
#include <vector>
#include <memory>
#include <unordered_map>
#include <string>
#include <utility>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/engine/dataset_iterator.h"
#include "minddata/dataset/engine/datasetops/pipeline_op.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/util/queue.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
namespace mindspore {
namespace dataset {
namespace py = pybind11;
class BuildSentencePieceVocabOp : public PipelineOp {
public:
class Builder {
public:
Builder();
// Destructor.
~Builder() = default;
// Setter method
// @param uint32_t size
// @return Builder setter method returns reference to the builder.
Builder &SetOpConnectorSize(uint32_t size) {
builder_connector_size_ = size;
return *this;
}
// Setter method
// @param uint32_t size
// @return Builder & reference to builder class object
Builder &SetVocabSize(uint32_t size) {
builder_vocab_size_ = size;
return *this;
}
// Setter method
// @param float character_coverage - amount of characters covered by the model
// @return Builder & reference to builder class object
Builder &SetCharacterCoverage(float character_coverage) {
builder_character_coverage_ = character_coverage;
return *this;
}
// Setter method
// @param SentencePieceModel model_type - model algorithm
// @return Builder & reference to builder class object
Builder &SetModelType(SentencePieceModel model_type) {
builder_model_type_ = model_type;
return *this;
}
// Setter method
// @param std::unordered_map<std::string, std::string> params
// @return Builder & reference to builder class object
Builder &SetParams(std::unordered_map<std::string, std::string> params) {
builder_params_ = params;
return *this;
}
// Setter method
// @param std::shared_ptr<SentencePieceVocab> vocab
// @return Builder & reference to builder class object
Builder &SetVocab(std::shared_ptr<SentencePieceVocab> vocab) {
builder_vocab_ = vocab;
return *this;
}
// set columns names
// @param const std::vector<std::string> & col_names - name of columns to get words
// @return Builder & reference to builder class object
Builder &SetColumnNames(const std::vector<std::string> &col_names) {
builder_col_names_ = col_names;
return *this;
}
// The builder "build" method creates the final object.
// @param std::shared_ptr<BuildVocabOp> *op - DatasetOp
// @return - The error code return
Status Build(std::shared_ptr<BuildSentencePieceVocabOp> *op);
private:
uint32_t builder_connector_size_;
uint32_t builder_vocab_size_;
float builder_character_coverage_;
SentencePieceModel builder_model_type_;
std::unordered_map<std::string, std::string> builder_params_;
std::vector<std::string> builder_col_names_;
std::shared_ptr<SentencePieceVocab> builder_vocab_;
};
public:
class DatasetSentenceIterator : public sentencepiece::SentenceIterator {
public:
explicit DatasetSentenceIterator(BuildSentencePieceVocabOp *s_p_vocab_ptr);
~DatasetSentenceIterator() {}
bool done() const override;
void Next() override;
const std::string &value() const override { return value_; }
sentencepiece::util::Status status() const override { return sentencepiece::util::Status(); }
private:
std::string value_;
BuildSentencePieceVocabOp *s_p_vocab_ptr_;
};
BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
uint32_t vocab_size, float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);
~BuildSentencePieceVocabOp() = default;
// the worker thread that runs SentencePiece training
Status SentenceThread();
Status EofReceived(int32_t) override { return Status::OK(); }
Status EoeReceived(int32_t) override { return Status::OK(); }
Status operator()() override;
// Getter
// @return the number of workers
int32_t num_producers() const override { return 1; }
// Getter
// @return the number of threads consuming from the previous Connector
int32_t num_consumers() const override { return 1; }
Status Reset() override { RETURN_STATUS_UNEXPECTED("Reset shouldn't be called in BuildSentencePieceVocabOp"); }
// build the input params for the SentencePiece trainer API
std::unordered_map<std::string, std::string> BuildParams();
bool Done();
void Next(std::string *sentence);
/// \param[in] p The node to visit
/// \param[out] modified Indicator if the node was modified
/// \return Status of the node visit
Status PreAccept(NodePass *p, bool *modified) override;
private:
bool read_done_;
Status ret_status_;
uint32_t vocab_size_;
float character_coverage_;
SentencePieceModel model_type_;
std::unordered_map<std::string, std::string> params_;
std::shared_ptr<SentencePieceVocab> vocab_;
std::vector<std::string> col_names_;
uint32_t col_id_;
std::unique_ptr<ChildIterator> child_iterator_; // child iterator for fetching TensorRows 1 by 1
std::unique_ptr<Queue<TensorRow>> sentence_queue_; // master thread assigns each worker TensorRow via this
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_ENGINE_DATASETOPS_BUILD_SENTENCE_VOCAB_OP_H_

@@ -17,6 +17,7 @@
#include "minddata/dataset/engine/opt/pass.h"
#include "minddata/dataset/engine/datasetops/batch_op.h"
#include "minddata/dataset/engine/datasetops/build_vocab_op.h"
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#include "minddata/dataset/engine/datasetops/cache_op.h"
#include "minddata/dataset/engine/datasetops/cache_merge_op.h"
#include "minddata/dataset/engine/datasetops/cache_lookup_op.h"
@@ -261,5 +262,10 @@ Status NodePass::PreRunOnNode(std::shared_ptr<BuildVocabOp> node, bool *modified
// Fallback to base class visitor by default
return PreRunOnNode(std::static_pointer_cast<DatasetOp>(node), modified);
}
Status NodePass::PreRunOnNode(std::shared_ptr<BuildSentencePieceVocabOp> node, bool *modified) {
// Fallback to base class visitor by default
return PreRunOnNode(std::static_pointer_cast<DatasetOp>(node), modified);
}
} // namespace dataset
} // namespace mindspore

@@ -81,6 +81,8 @@ class EpochCtrlOp;
class BuildVocabOp;
class BuildSentencePieceVocabOp;
// The base class Pass is the basic unit of tree transformation.
// The actual implementation of the passes will be derived from here.
class Pass : public std::enable_shared_from_this<Pass> {
@@ -206,6 +208,8 @@ class NodePass : public Pass {
virtual Status PreRunOnNode(std::shared_ptr<BuildVocabOp> node, bool *modified);
virtual Status PreRunOnNode(std::shared_ptr<BuildSentencePieceVocabOp> node, bool *modified);
private:
// Helper function to perform DFS visit
Status DFSNodeVisit(std::shared_ptr<DatasetOp> node, bool *modified);

@@ -37,6 +37,16 @@ Status InjectionPass::InjectionFinder::PreRunOnNode(std::shared_ptr<BuildVocabOp
}
}
// Performs finder work for BuildSentencePieceVocabOp that has special rules about epoch control injection
Status InjectionPass::InjectionFinder::PreRunOnNode(std::shared_ptr<BuildSentencePieceVocabOp> node, bool *modified) {
if (injection_pass_) {
injection_pass_->epoch_ctrl_bypass_ = true;
return Status::OK();
} else {
RETURN_STATUS_UNEXPECTED("Missing outer injection pass object from inside InjectionFinder!");
}
}
// Temporary code to prevent the injection of epoch control when cache op is present
// Remove this code in cache op phase 2
Status InjectionPass::InjectionFinder::PreRunOnNode(std::shared_ptr<CacheOp> node, bool *modified) {

@@ -45,6 +45,12 @@ class InjectionPass : public TreePass {
/// \return Status The error code return
Status PreRunOnNode(std::shared_ptr<BuildVocabOp> node, bool *modified) override;
/// \brief Performs finder work for BuildSentencePieceVocabOp that has special rules about epoch control injection.
/// \param[in] node The node being visited
/// \param[inout] modified Indicator if the node was changed at all
/// \return Status The error code return
Status PreRunOnNode(std::shared_ptr<BuildSentencePieceVocabOp> node, bool *modified) override;
/// \brief Temporary code to prevent the injection of epoch control when cache op is present.
/// Remove this code in cache op phase 2
/// \param[in] node The node being visited

@@ -136,6 +136,7 @@ constexpr char kRandomChoiceOp[] = "RandomChoiceOp";
constexpr char kRandomApplyOp[] = "RandomApplyOp";
constexpr char kComposeOp[] = "ComposeOp";
constexpr char kRandomSelectSubpolicyOp[] = "RandomSelectSubpolicyOp";
constexpr char kSentencepieceTokenizerOp[] = "SentencepieceTokenizerOp";
// data
constexpr char kConcatenateOp[] = "kConcatenateOp";

@@ -4,6 +4,7 @@ file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(text OBJECT
vocab.cc
sentence_piece_vocab.cc
)
add_dependencies(text text-kernels)

@@ -21,5 +21,6 @@ add_library(text-kernels OBJECT
wordpiece_tokenizer_op.cc
truncate_sequence_pair_op.cc
to_number_op.cc
sentence_piece_tokenizer_op.cc
${ICU_DEPEND_FILES}
)

@@ -0,0 +1,99 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include <memory>
#include <vector>
#include "common/utils.h"
#include "minddata/dataset/util/path.h"
namespace mindspore {
namespace dataset {
SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::shared_ptr<SentencePieceVocab> vocab,
const SPieceTokenizerLoadType load_type,
const SPieceTokenizerOutType out_type)
: vocab_(vocab), load_type_(load_type), out_type_(out_type) {}
SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::string &model_path, const std::string &model_filename,
const SPieceTokenizerLoadType load_type,
const SPieceTokenizerOutType out_type)
: load_type_(load_type), out_type_(out_type) {
(void)GetModelRealPath(model_path, model_filename);
}
Status SentencePieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor");
}
std::string_view sentence_v;
RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {}));
std::string sentence{sentence_v};
if (load_type_ == SPieceTokenizerLoadType::kFile) {
auto status = processor_.Load(file_path_);
if (!status.ok()) {
RETURN_STATUS_UNEXPECTED("load sentence piece model failed.");
}
} else {
RETURN_UNEXPECTED_IF_NULL(vocab_);
auto status = processor_.LoadFromSerializedProto(vocab_.get()->model_proto());
if (!status.ok()) {
RETURN_STATUS_UNEXPECTED("sentence piece load model failed.");
}
}
if (out_type_ == SPieceTokenizerOutType::kString) {
std::vector<std::string> pieces;
auto status = processor_.Encode(sentence, &pieces);
if (!status.ok()) {
RETURN_STATUS_UNEXPECTED("sentence piece tokenizer error");
}
*output = std::make_unique<Tensor>(pieces, TensorShape({(dsize_t)pieces.size()}));
} else {
std::vector<int> ids;
auto status = processor_.Encode(sentence, &ids);
if (!status.ok()) {
RETURN_STATUS_UNEXPECTED("sentence piece tokenizer error");
}
RETURN_IF_NOT_OK(Tensor::CreateTensor(output, ids, TensorShape({(dsize_t)ids.size()})));
}
return Status::OK();
}
Status SentencePieceTokenizerOp::GetModelRealPath(const std::string &model_path, const std::string &filename) {
char real_path[PATH_MAX] = {0};
if (model_path.size() >= PATH_MAX) {
RETURN_STATUS_UNEXPECTED("sentence piece model path is invalid.");
}
#if defined(_WIN32) || defined(_WIN64)
if (_fullpath(real_path, common::SafeCStr(model_path), PATH_MAX) == nullptr) {
RETURN_STATUS_UNEXPECTED("sentence piece model path is invalid.");
}
#else
if (realpath(common::SafeCStr(model_path), real_path) == nullptr) {
RETURN_STATUS_UNEXPECTED("sentence piece model path is invalid.");
}
#endif
std::string abs_path = real_path;
file_path_ = (Path(abs_path) / Path(filename)).toString();
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
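For intuition on the two out_type branches in Compute(), the same split exists in the SentencePiece Python API. A sketch, assuming m.model was produced by the training path above:

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="m.model")  # kFile-style load
print(sp.encode("hello world", out_type=str))  # kString branch: subword pieces
print(sp.encode("hello world", out_type=int))  # kInt branch: piece ids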

@@ -0,0 +1,65 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_SENTENCE_PIECE_TOKENIZER_OP_H
#define DATASET_SENTENCE_PIECE_TOKENIZER_OP_H
#include <sentencepiece_processor.h>
#include <string>
#include <iostream>
#include <memory>
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
namespace mindspore {
namespace dataset {
enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };
class SentencePieceTokenizerOp : public TensorOp {
public:
SentencePieceTokenizerOp(const std::shared_ptr<SentencePieceVocab> vocab, SPieceTokenizerLoadType load_type,
const SPieceTokenizerOutType out_type);
SentencePieceTokenizerOp(const std::string &model_path, const std::string &model_filename,
const SPieceTokenizerLoadType load_type, const SPieceTokenizerOutType out_type);
~SentencePieceTokenizerOp() override = default;
Status GetModelRealPath(const std::string &model_path, const std::string &filename);
void Print(std::ostream &out) const override {
out << "SentencePieceTokenizerOp out_type = " << out_type_ << " load_type = " << load_type_;
}
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
std::string Name() const override { return kSentencepieceTokenizerOp; }
protected:
SPieceTokenizerOutType out_type_;
std::shared_ptr<SentencePieceVocab> vocab_;
std::string file_path_;
SPieceTokenizerLoadType load_type_;
sentencepiece::SentencePieceProcessor processor_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_SENTENCE_PIECE_TOKENIZER_OP_H

@@ -0,0 +1,112 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include <sentencepiece_trainer.h>
#include <sentencepiece_processor.h>
#include <fstream>
#include "common/utils.h"
#include "minddata/dataset/util/path.h"
namespace mindspore {
namespace dataset {
SentencePieceVocab::SentencePieceVocab() : model_proto_("") {}
Status SentencePieceVocab::BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab) {
std::unordered_map<std::string, std::string> unorder_map;
// SentencePiece expects its input file list as a single comma-separated string
std::string input_str = "";
for (auto path : path_list) {
input_str += path;
input_str += ",";
}
if (!input_str.empty()) {
input_str.pop_back();  // drop the trailing comma
}
unorder_map["input"] = input_str;
unorder_map["vocab_size"] = std::to_string(vocab_size);
unorder_map["model_prefix"] = "";
unorder_map["minloglevel"] = "1";
unorder_map["character_coverage"] = std::to_string(character_coverage);
if (model_type == SentencePieceModel::kWord) {
unorder_map["model_type"] = "WORD";
} else if (model_type == SentencePieceModel::kBpe) {
unorder_map["model_type"] = "BPE";
} else if (model_type == SentencePieceModel::kChar) {
unorder_map["model_type"] = "CHAR";
} else {
unorder_map["model_type"] = "UNIGRAM";
}
// filter out params that are already set by explicit function arguments;
// model_prefix in particular must stay empty so the model proto is returned in memory
for (auto param : params) {
std::string key = param.first;
if (key == "input" || key == "vocab_size" || key == "model_prefix" || key == "character_coverage" ||
key == "model_type") {
continue;
}
unorder_map[key] = param.second;
}
// force the SentencePiece library's log level, overriding any user-supplied value
unorder_map["minloglevel"] = "1";
*vocab = std::make_shared<SentencePieceVocab>();
std::string model_proto;
sentencepiece::util::Status s_status = sentencepiece::SentencePieceTrainer::Train(unorder_map, nullptr, &model_proto);
if (!s_status.ok()) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, s_status.message());
}
vocab->get()->set_model_proto(model_proto);
return Status::OK();
}
Status SentencePieceVocab::SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path,
std::string filename) {
char real_path[PATH_MAX] = {0};
if (path.size() >= PATH_MAX) {
RETURN_STATUS_UNEXPECTED("sentence model path is invalid.");
}
#if defined(_WIN32) || defined(_WIN64)
if (_fullpath(real_path, common::SafeCStr(path), PATH_MAX) == nullptr) {
RETURN_STATUS_UNEXPECTED("sentence model path is invalid.");
}
#else
if (realpath(common::SafeCStr(path), real_path) == nullptr) {
RETURN_STATUS_UNEXPECTED("sentence model path is invalid.");
}
#endif
std::string abs_real_path = (Path(real_path) / Path(filename)).toString();
std::ofstream os_file(abs_real_path, std::ios::out);
if (!os_file.is_open()) {
RETURN_STATUS_UNEXPECTED("failed to open model output file: " + abs_real_path);
}
(void)os_file.write(vocab->get()->model_proto().data(), vocab->get()->model_proto().size());
os_file.close();
return Status::OK();
}
const std::string &SentencePieceVocab::model_proto() { return model_proto_; }
void SentencePieceVocab::set_model_proto(const std::string &model_proto) { model_proto_ = model_proto; }
} // namespace dataset
} // namespace mindspore
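Note the reserved-key filter above: user params can tune the trainer but never override the keys the function itself owns. A small Python sketch of the effective behavior (hypothetical inputs):

# The user tries to smuggle in model_prefix and vocab_size via params.
params = {"model_prefix": "evil", "vocab_size": "99999", "num_threads": "4"}
# After the filter loop, only num_threads survives: model_prefix stays "",
# vocab_size stays the explicit argument, and minloglevel is forced to "1".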

@@ -0,0 +1,49 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
#define DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
#include <string>
#include <memory>
#include <vector>
#include <unordered_map>
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 };
class SentencePieceVocab {
public:
static Status BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab);
static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
SentencePieceVocab();
~SentencePieceVocab() = default;
const std::string &model_proto();
void set_model_proto(const std::string &model_proto);
private:
std::string model_proto_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_

@@ -46,6 +46,7 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_save
from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
from ..text.utils import DE_C_INTER_SENTENCEPIECE_MODE
try:
context = import_module("mindspore.context")
@@ -909,6 +910,11 @@ class Dataset:
def build_vocab(self, vocab, columns, freq_range, top_k, special_tokens, special_first):
return BuildVocabDataset(self, vocab, columns, freq_range, top_k, special_tokens, special_first)
def build_sentencepiece_vocab(self, vocab, col_names, vocab_size,
character_coverage, model_type, params):
return BuildSentencePieceVocabDataset(self, vocab, col_names, vocab_size, character_coverage,
model_type, params)
def apply(self, apply_func):
"""
Apply a function in this dataset.
@@ -5154,3 +5160,58 @@ class BuildVocabDataset(DatasetOp):
new_op.special_first = copy.deepcopy(self.special_first)
return new_op
class BuildSentencePieceVocabDataset(DatasetOp):
"""
Build a SentencePieceVocab from a dataset.
This function is not meant to be called directly by user. To build vocab, please use the function
text.SentencePieceVocab.from_dataset()
Args:
vocab(SentencePieceVocab): text.SentencePieceVocab object.
col_names(list): The list of the col name.
vocab_size(int): Vocabulary size, the type of uint32_t.
charater_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for languages
with rich character set like Japanse or Chinese and 1.0 for other languages with small character set.
model_type(SentencePieceModel): Model type.Choose from unigram (default), bpe, char, or word.
The input sentence must be pretokenized when using word type.
params(dict): A dictionary with no incoming parameters.
"""
def __init__(self, input_dataset, vocab, col_names, vocab_size, character_coverage, model_type, params):
super().__init__()
self.vocab = vocab
self.col_names = col_names
self.vocab_size = vocab_size
self.children.append(input_dataset)
self.character_coverage = character_coverage
self.model_type = DE_C_INTER_SENTENCEPIECE_MODE[model_type]
self.params = params
input_dataset.parent.append(self)
def get_args(self):
args = super().get_args()
args["vocab"] = self.vocab
args["col_names"] = self.col_names
args["vocab_size"] = self.vocab_size
args["character_coverage"] = self.character_coverage
args["model_type"] = self.model_type
args["params"] = self.params
return args
def __deepcopy__(self, memodict):
if id(self) in memodict:
return memodict[id(self)]
cls = self.__class__
new_op = cls.__new__(cls)
memodict[id(self)] = new_op
new_op.children = copy.deepcopy(self.children, memodict)
new_op.col_names = copy.deepcopy(self.col_names, memodict)
new_op.num_parallel_workers = copy.deepcopy(self.num_parallel_workers, memodict)
new_op.vocab_size = copy.deepcopy(self.vocab_size, memodict)
new_op.parent = copy.deepcopy(self.parent, memodict)
new_op.character_coverage = copy.deepcopy(self.character_coverage, memodict)
new_op.params = copy.deepcopy(self.params, memodict)
new_op.vocab = self.vocab
new_op.model_type = copy.deepcopy(self.model_type, memodict)
return new_op
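Putting the Python plumbing together, a hedged end-to-end sketch. SentencePieceVocab.from_dataset, the SentencePieceTokenizer wrapper, and the Python-level enum members (UNIGRAM, STRING) live in files not shown in this diff, so the sketch assumes they mirror the C++ bindings; file and column names are placeholders:

import mindspore.dataset as ds
import mindspore.dataset.text as text

# Build a vocab from a text dataset, then tokenize the same column with it.
data = ds.TextFileDataset("corpus.txt", shuffle=False)
vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995,
                                             text.SentencePieceModel.UNIGRAM, {})
tokenizer = text.SentencePieceTokenizer(vocab, out_type=text.SPieceTokenizerOutType.STRING)
data = data.map(operations=tokenizer, input_columns=["text"])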

@@ -181,6 +181,8 @@ class Iterator:
op_type = OpName.TEXTFILE
elif isinstance(dataset, de.BuildVocabDataset):
op_type = OpName.BUILDVOCAB
elif isinstance(dataset, de.BuildSentencePieceVocabDataset):
op_type = OpName.SENTENCEPIECEVOCAB
elif isinstance(dataset, de.CLUEDataset):
op_type = OpName.CLUE
else:

@@ -19,13 +19,16 @@ utils provides some general methods for nlp text processing.
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \
ToNumber, SlidingWindow
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm
ToNumber, SlidingWindow, SentencePieceTokenizer
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
SPieceTokenizerOutType, SPieceTokenizerLoadType
__all__ = [
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
"to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
"PythonTokenizer", "SlidingWindow"
"PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType",
"SentencePieceModel", "SPieceTokenizerLoadType"
]
if platform.system().lower() != 'windows':

Some files were not shown because too many files have changed in this diff.
