!9024 Add four text C++ API

From: @shenwei41 Reviewed-by: Signed-off-by:
4 years ago · 3bed28822d
parent aa2296bcc1 4e56618d18
commit 3bed28822d
7 changed files with 865 additions and 4 deletions
--- a/mindspore/ccsrc/minddata/dataset/api/text.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/text.cc
@ -15,9 +15,16 @@
 */

 #include <unistd.h>
+
 #include "minddata/dataset/include/text.h"
+#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/lookup_op.h"
+#include "minddata/dataset/text/kernels/ngram_op.h"
 #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
+#include "minddata/dataset/text/kernels/sliding_window_op.h"
+#ifndef _WIN32
+#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
+#endif
 #include "minddata/dataset/util/path.h"

 namespace mindspore {
@ -29,6 +36,13 @@ namespace text {
 // FUNCTIONS TO CREATE TEXT OPERATIONS
 // (In alphabetical order)

+// Creates a JiebaTokenizerOperation from the given dictionary paths, mode and offsets flag.
+// Yields nullptr when the operation's ValidateParams() result evaluates to false.
+std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
+                                                        const JiebaMode &mode, bool with_offsets) {
+  std::shared_ptr<JiebaTokenizerOperation> jieba_op =
+    std::make_shared<JiebaTokenizerOperation>(hmm_path, mp_path, mode, with_offsets);
+  if (jieba_op->ValidateParams()) {
+    return jieba_op;
+  }
+  return nullptr;
+}
+
 std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
                                        const DataType &data_type) {
  auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);
@ -36,6 +50,14 @@ std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, con
  return op->ValidateParams() ? op : nullptr;
 }

+// Creates an NgramOperation from the n-gram sizes, the two padding pairs and the separator.
+// Yields nullptr when the operation's ValidateParams() result evaluates to false.
+std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
+                                      const std::pair<std::string, int32_t> &left_pad,
+                                      const std::pair<std::string, int32_t> &right_pad, const std::string &separator) {
+  std::shared_ptr<NgramOperation> ngram_op =
+    std::make_shared<NgramOperation>(ngrams, left_pad, right_pad, separator);
+  if (ngram_op->ValidateParams()) {
+    return ngram_op;
+  }
+  return nullptr;
+}
+
 std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
  auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab, out_type);
@ -50,12 +72,79 @@ std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(const st
  return op->ValidateParams() ? op : nullptr;
 }

+// Creates a SlidingWindowOperation for the given window width and axis.
+// Yields nullptr when the operation's ValidateParams() result evaluates to false.
+std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis) {
+  std::shared_ptr<SlidingWindowOperation> window_op = std::make_shared<SlidingWindowOperation>(width, axis);
+  if (window_op->ValidateParams()) {
+    return window_op;
+  }
+  return nullptr;
+}
+
+#ifndef _WIN32
+// Creates a WhitespaceTokenizerOperation carrying the with_offsets flag.
+// Yields nullptr when the operation's ValidateParams() result evaluates to false.
+std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets) {
+  std::shared_ptr<WhitespaceTokenizerOperation> whitespace_op =
+    std::make_shared<WhitespaceTokenizerOperation>(with_offsets);
+  if (whitespace_op->ValidateParams()) {
+    return whitespace_op;
+  }
+  return nullptr;
+}
+#endif
+
 /* ####################################### Validator Functions ############################################ */

+// Helper that checks a tokenizer resource path. In order, the path must be non-empty,
+// must exist according to Path::Exists(), and must pass access(..., R_OK).
+// The first failing check is logged and returned via RETURN_STATUS_SYNTAX_ERROR;
+// Status::OK() is returned when every check passes.
+Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) {
+  std::string failure;
+  if (tokenizer_file.empty()) {
+    failure = tokenizer_name + ": tokenizer_file is not specified.";
+  } else if (!Path(tokenizer_file).Exists()) {
+    failure = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path.";
+  } else if (access(tokenizer_file.c_str(), R_OK) == -1) {
+    failure = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file;
+  }
+
+  // Every message above is non-empty, so an empty string means no check failed.
+  if (!failure.empty()) {
+    MS_LOG(ERROR) << failure;
+    RETURN_STATUS_SYNTAX_ERROR(failure);
+  }
+
+  return Status::OK();
+}
+
 /* ####################################### Derived TensorOperation classes ################################# */

 // (In alphabetical order)

+// JiebaTokenizerOperation
+// The constructor only copies its arguments into members; checking happens in ValidateParams().
+JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
+                                                 const JiebaMode &mode, bool with_offsets)
+    : hmm_path_(hmm_path),
+      mp_path_(mp_path),
+      mode_(mode),
+      with_offsets_(with_offsets) {
+}
+
+// Validates the two dictionary paths: both must be non-empty (the HMM path is reported first),
+// and each must then pass ValidateTokenizerDirParam().
+Status JiebaTokenizerOperation::ValidateParams() {
+  const bool hmm_missing = hmm_path_.empty();
+  if (hmm_missing || mp_path_.empty()) {
+    std::string err_msg = hmm_missing ? "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided."
+                                      : "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided.";
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  // HMM dictionary first, then MP dictionary.
+  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_));
+  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_));
+
+  return Status::OK();
+}
+
+// Instantiates the JiebaTokenizerOp kernel from the stored paths, mode and offsets flag.
+std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
+  return std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
+}
+
 // LookupOperation
 LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
                                 const DataType &data_type)
@ -83,6 +172,54 @@ std::shared_ptr<TensorOp> LookupOperation::Build() {
  return tensor_op;
 }

+// NgramOperation
+// The constructor only copies its arguments into members; checking happens in ValidateParams().
+NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
+                               const std::pair<std::string, int32_t> &right_pad, const std::string &separator)
+    : ngrams_(ngrams),
+      left_pad_(left_pad),
+      right_pad_(right_pad),
+      separator_(separator) {
+}
+
+// Validates the Ngram parameters:
+//   - ngrams must be non-empty and every entry must be greater than 0;
+//   - the pad widths (second element) of left_pad and right_pad must be >= 0.
+// The first violation is logged and returned through RETURN_STATUS_SYNTAX_ERROR;
+// Status::OK() is returned when all checks pass.
+Status NgramOperation::ValidateParams() {
+  if (ngrams_.empty()) {
+    std::string err_msg = "Ngram : Container cannot be empty.";
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  // Range-for avoids comparing a signed index against the unsigned result of size().
+  for (const int32_t gram : ngrams_) {
+    if (gram <= 0) {
+      std::string err_msg = "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(gram);
+      MS_LOG(ERROR) << err_msg;
+      RETURN_STATUS_SYNTAX_ERROR(err_msg);
+    }
+  }
+
+  if (left_pad_.second < 0) {
+    std::string err_msg =
+      "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " +
+      std::to_string(left_pad_.second);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  if (right_pad_.second < 0) {
+    std::string err_msg =
+      "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " +
+      std::to_string(right_pad_.second);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+  return Status::OK();
+}
+
+// Unpacks the (pad_token, pad_width) pairs and constructs the NgramOp kernel from them,
+// together with the stored n-gram sizes and separator.
+std::shared_ptr<TensorOp> NgramOperation::Build() {
+  auto left_token = left_pad_.first;
+  auto right_token = right_pad_.first;
+  auto left_width = left_pad_.second;
+  auto right_width = right_pad_.second;
+  return std::make_shared<NgramOp>(ngrams_, left_width, right_width, left_token, right_token, separator_);
+}
+
 // SentencePieceTokenizerOperation
 SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
                                                                 SPieceTokenizerOutType out_type)
@ -128,6 +265,36 @@ std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
  return tensor_op;
 }

+// SlidingWindowOperation
+// The constructor only stores width and axis; checking happens in ValidateParams().
+SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis)
+    : width_(width),
+      axis_(axis) {
+}
+
+// Accepts any width of at least 1. The axis value is not inspected here.
+Status SlidingWindowOperation::ValidateParams() {
+  const bool width_ok = (width_ >= 1);
+  if (!width_ok) {
+    std::string err_msg("SlidingWindow : The parameter width must be greater than or equal to 1: ");
+    err_msg += std::to_string(width_);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+  return Status::OK();
+}
+
+// Instantiates the SlidingWindowOp kernel; the width is converted to uint32_t for the kernel constructor.
+std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
+  return std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_);
+}
+
+#ifndef _WIN32
+// WhitespaceTokenizerOperation
+// The constructor only stores the with_offsets flag.
+WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets)
+    : with_offsets_(with_offsets) {
+}
+
+// No parameter of this operation is checked, so validation always reports success.
+Status WhitespaceTokenizerOperation::ValidateParams() {
+  return Status::OK();
+}
+
+// Instantiates the WhitespaceTokenizerOp kernel with the stored with_offsets flag.
+std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() {
+  return std::make_shared<WhitespaceTokenizerOp>(with_offsets_);
+}
+#endif
+
 }  // namespace text
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/core/constants.h
+++ b/mindspore/ccsrc/minddata/dataset/core/constants.h
@ -50,6 +50,15 @@ enum class ImageFormat { HWC = 0, CHW = 1, HW = 2 };
 // Possible interpolation modes
 enum class InterpolationMode { kLinear = 0, kNearestNeighbour = 1, kCubic = 2, kArea = 3 };

+// Segmentation modes accepted by JiebaTokenizer: kMix (mix of MPSegment and HMMSegment), kMp (MPSegment), kHmm (HMMSegment)
+enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 };
+
+// Output types selectable through the out_type argument of SentencePieceTokenizer
+enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
+
+// Possible values for SPieceTokenizerLoadType (presumably: load the vocab from a file path vs. from a vocab object - TODO confirm)
+enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };
+
 // convenience functions for 32bit int bitmask
 inline bool BitTest(uint32_t bits, uint32_t bitMask) { return (bits & bitMask) == bitMask; }

--- a/mindspore/ccsrc/minddata/dataset/include/text.h
+++ b/mindspore/ccsrc/minddata/dataset/include/text.h
@ -19,6 +19,7 @@

 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>

 #include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
@ -37,8 +38,29 @@ namespace dataset {
 namespace text {

 // Text Op classes (in alphabetical order)
+class JiebaTokenizerOperation;
 class LookupOperation;
+class NgramOperation;
 class SentencePieceTokenizerOperation;
+class SlidingWindowOperation;
+#ifndef _WIN32
+class WhitespaceTokenizerOperation;
+#endif
+
+/// \brief Tokenize Chinese string into words based on dictionary.
+/// \param[in] hmm_path Dictionary file used by the HMMSegment algorithm. The dictionary can be obtained on the
+///   official website of cppjieba.
+/// \param[in] mp_path Dictionary file used by the MPSegment algorithm. The dictionary can be obtained on the
+///   official website of cppjieba.
+/// \param[in] mode Valid values are JiebaMode::kMp, JiebaMode::kHmm and JiebaMode::kMix (default=JiebaMode::kMix).
+///   - JiebaMode::kMp, tokenize with MPSegment algorithm.
+///   - JiebaMode::kHmm, tokenize with Hidden Markov Model Segment algorithm.
+///   - JiebaMode::kMix, tokenize with a mix of MPSegment and HMMSegment algorithm.
+/// \param[in] with_offsets Whether or not to output offsets of tokens (default=false).
+/// \return Shared pointer to the current TensorOperation, or nullptr if parameter validation fails.
+std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
+                                                        const JiebaMode &mode = JiebaMode::kMix,
+                                                        bool with_offsets = false);

 /// \brief Lookup operator that looks up a word to an id.
 /// \param[in] vocab a Vocab object.
@ -49,6 +71,21 @@ class SentencePieceTokenizerOperation;
 std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
                                        const mindspore::dataset::DataType &data_type = DataType("int32"));

+/// \brief TensorOp to generate n-gram from a 1-D string Tensor.
+/// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
+///   would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
+///   an n-gram, an empty string will be returned.
+/// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will
+///   be capped at n-1. left_pad={"_", 2} would pad left side of the sequence with "__" (default={"", 0}).
+/// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence. pad_width will
+///   be capped at n-1. right_pad={"-", 2} would pad right side of the sequence with "--" (default={"", 0}).
+/// \param[in] separator Symbol used to join strings together (default=" ").
+/// \return Shared pointer to the current TensorOperation, or nullptr if parameter validation fails.
+std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
+                                      const std::pair<std::string, int32_t> &left_pad = {"", 0},
+                                      const std::pair<std::string, int32_t> &right_pad = {"", 0},
+                                      const std::string &separator = " ");
+
 /// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
 /// \param[in] vocab a SentencePieceVocab object.
 /// \param[in] out_type The type of output.
@ -63,8 +100,41 @@ std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
 std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);

+/// \brief TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension
+///   axis is a slice of data starting at the corresponding position, with a specified width.
+/// \param[in] width The width of the window. It must be an integer greater than zero.
+/// \param[in] axis The axis along which the sliding window is computed (default=0); axis supports 0 or -1 only
+///   for now.
+/// \return Shared pointer to the current TensorOperation, or nullptr if parameter validation fails.
+std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis = 0);
+
+#ifndef _WIN32
+/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces.
+/// \param[in] with_offsets Whether or not to output offsets of tokens (default=false).
+/// \return Shared pointer to the current TensorOperation, or nullptr if parameter validation fails.
+std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
+#endif
+
 /* ####################################### Derived TensorOperation classes ################################# */

+class JiebaTokenizerOperation : public TensorOperation {  // Holds the JiebaTokenizer parameters until Build() is called.
+ public:
+  explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
+                                   bool with_offsets);
+
+  ~JiebaTokenizerOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;  // Creates a JiebaTokenizerOp from the stored parameters.
+
+  Status ValidateParams() override;  // Both dictionary paths must be non-empty, exist and be readable.
+
+ private:
+  std::string hmm_path_;  // Dictionary file for the HMMSegment algorithm.
+  std::string mp_path_;  // Dictionary file for the MPSegment algorithm.
+  JiebaMode mode_;  // Segmentation mode forwarded to the kernel.
+  bool with_offsets_;  // Whether token offsets are output as well.
+};
+
 class LookupOperation : public TensorOperation {
 public:
  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
@ -83,6 +153,24 @@ class LookupOperation : public TensorOperation {
  DataType data_type_;
 };

+class NgramOperation : public TensorOperation {  // Holds the Ngram parameters until Build() is called.
+ public:
+  explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
+                          const std::pair<std::string, int32_t> &right_pad, const std::string &separator);
+
+  ~NgramOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;  // Creates an NgramOp from the stored parameters.
+
+  Status ValidateParams() override;  // ngrams must be non-empty with entries > 0; both pad widths must be >= 0.
+
+ private:
+  std::vector<int32_t> ngrams_;  // Requested n-gram sizes.
+  std::pair<std::string, int32_t> left_pad_;  // {pad_token, pad_width} for the left side.
+  std::pair<std::string, int32_t> right_pad_;  // {pad_token, pad_width} for the right side.
+  std::string separator_;  // Symbol used to join strings together.
+};
+
 class SentencePieceTokenizerOperation : public TensorOperation {
 public:
  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);
@ -101,6 +189,37 @@ class SentencePieceTokenizerOperation : public TensorOperation {
  SPieceTokenizerLoadType load_type_;
  SPieceTokenizerOutType out_type_;
 };
+
+class SlidingWindowOperation : public TensorOperation {  // Holds the SlidingWindow parameters until Build() is called.
+ public:
+  explicit SlidingWindowOperation(const int32_t width, const int32_t axis);
+
+  ~SlidingWindowOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;  // Creates a SlidingWindowOp from width (cast to uint32_t) and axis.
+
+  Status ValidateParams() override;  // Only checks that width is >= 1; axis is not checked here.
+
+ private:
+  int32_t width_;  // Width of the window.
+  int32_t axis_;  // Axis along which the sliding window is computed.
+};
+
+#ifndef _WIN32
+class WhitespaceTokenizerOperation : public TensorOperation {  // Holds the WhitespaceTokenizer flag until Build() is called.
+ public:
+  explicit WhitespaceTokenizerOperation(bool with_offsets);
+
+  ~WhitespaceTokenizerOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;  // Creates a WhitespaceTokenizerOp with the stored flag.
+
+  Status ValidateParams() override;  // Always returns Status::OK().
+
+ private:
+  bool with_offsets_;  // Whether token offsets are output as well.
+};
+#endif
 }  // namespace text
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/jieba_tokenizer_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/jieba_tokenizer_op.h
@ -20,14 +20,13 @@
 #include <memory>

 #include "cppjieba/Jieba.hpp"
+#include "minddata/dataset/core/constants.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/util/status.h"

 namespace mindspore {
 namespace dataset {

-enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 };
-
 class JiebaTokenizerOp : public TensorOp {
 public:
  // default constant for Jieba MPSegment algorithm.
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h
@ -23,14 +23,13 @@
 #include <iostream>
 #include <memory>

+#include "minddata/dataset/core/constants.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/util/status.h"
 #include "minddata/dataset/text/sentence_piece_vocab.h"

 namespace mindspore {
 namespace dataset {
-enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
-enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };

 class SentencePieceTokenizerOp : public TensorOp {
 public:
--- a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc
+++ b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc
@ -18,6 +18,7 @@
 #include <string>

 #include "common/common.h"
+#include "minddata/dataset/core/constants.h"
 #include "minddata/dataset/include/datasets.h"
 #include "minddata/dataset/include/status.h"
 #include "minddata/dataset/include/transforms.h"
--- a/tests/ut/cpp/dataset/c_api_text_test.cc
+++ b/tests/ut/cpp/dataset/c_api_text_test.cc