Add four new text APIs

5 years ago · 3bea84d0f7
parent 053bcd0266
commit 3bea84d0f7
5 changed files with 752 additions and 8 deletions
--- a/mindspore/ccsrc/minddata/dataset/api/text.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/text.cc
@ -17,12 +17,20 @@
 #include <unistd.h>

 #include "minddata/dataset/include/text.h"
+#ifndef _WIN32
+#include "minddata/dataset/text/kernels/case_fold_op.h"
+#endif
 #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/lookup_op.h"
 #include "minddata/dataset/text/kernels/ngram_op.h"
+#ifndef _WIN32
+#include "minddata/dataset/text/kernels/normalize_utf8_op.h"
+#endif
 #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/sliding_window_op.h"
+#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
 #ifndef _WIN32
+#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
 #endif
 #include "minddata/dataset/util/path.h"
@ -36,6 +44,14 @@ namespace text {
 // FUNCTIONS TO CREATE TEXT OPERATIONS
 // (In alphabetical order)

+#ifndef _WIN32
+std::shared_ptr<CaseFoldOperation> CaseFold() {  // Factory for the CaseFold text operation; takes no parameters.
+  auto op = std::make_shared<CaseFoldOperation>();
+
+  return op->ValidateParams() ? op : nullptr;  // Yields op when ValidateParams() evaluates to true, otherwise nullptr.
+}
+#endif
+
 std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
                                                        const JiebaMode &mode, bool with_offsets) {
  auto op = std::make_shared<JiebaTokenizerOperation>(hmm_path, mp_path, mode, with_offsets);
@ -58,6 +74,14 @@ std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
  return op->ValidateParams() ? op : nullptr;
 }

+#ifndef _WIN32
+std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form) {
+  auto op = std::make_shared<NormalizeUTF8Operation>(normalize_form);  // Factory: wraps normalize_form in an operation.
+
+  return op->ValidateParams() ? op : nullptr;  // Yields op when ValidateParams() evaluates to true, otherwise nullptr.
+}
+#endif
+
 std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
  auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab, out_type);
@ -78,7 +102,19 @@ std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const
  return op->ValidateParams() ? op : nullptr;
 }

+std::shared_ptr<UnicodeCharTokenizerOperation> UnicodeCharTokenizer(bool with_offsets) {
+  auto op = std::make_shared<UnicodeCharTokenizerOperation>(with_offsets);  // Factory: forwards with_offsets unchanged.
+
+  return op->ValidateParams() ? op : nullptr;  // Yields op when ValidateParams() evaluates to true, otherwise nullptr.
+}
+
 #ifndef _WIN32
+std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets) {
+  auto op = std::make_shared<UnicodeScriptTokenizerOperation>(keep_whitespace, with_offsets);  // Factory: forwards both flags.
+
+  return op->ValidateParams() ? op : nullptr;  // Yields op when ValidateParams() evaluates to true, otherwise nullptr.
+}
+
 std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets) {
  auto op = std::make_shared<WhitespaceTokenizerOperation>(with_offsets);

@ -116,6 +152,16 @@ Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::s

 // (In alphabetical order)

+#ifndef _WIN32
+// CaseFoldOperation: out-of-line definitions of ValidateParams() and Build().
+Status CaseFoldOperation::ValidateParams() { return Status::OK(); }  // Nothing to check; always returns Status::OK().
+
+std::shared_ptr<TensorOp> CaseFoldOperation::Build() {  // Creates the CaseFoldOp tensor op for this operation.
+  std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>();
+  return tensor_op;  // Converted to std::shared_ptr<TensorOp> on return.
+}
+#endif
+
 // JiebaTokenizerOperation
 JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
                                                 const JiebaMode &mode, bool with_offsets)
@ -220,6 +266,18 @@ std::shared_ptr<TensorOp> NgramOperation::Build() {
  return tensor_op;
 }

+#ifndef _WIN32
+// NormalizeUTF8Operation: stores the requested normalization form and builds a NormalizeUTF8Op from it.
+NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
+
+Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); }  // No check is made on normalize_form_.
+
+std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {  // Creates a NormalizeUTF8Op from the stored form.
+  std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
+  return tensor_op;  // Converted to std::shared_ptr<TensorOp> on return.
+}
+#endif
+
 // SentencePieceTokenizerOperation
 SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
                                                                 SPieceTokenizerOutType out_type)
@ -283,7 +341,29 @@ std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
  return tensor_op;
 }

+// UnicodeCharTokenizerOperation: stores the with_offsets flag and builds a UnicodeCharTokenizerOp from it.
+UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}
+
+Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); }  // Always returns Status::OK().
+
+std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {  // Creates the tensor op from the stored flag.
+  std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_);
+  return tensor_op;  // Converted to std::shared_ptr<TensorOp> on return.
+}
+
 #ifndef _WIN32
+// UnicodeScriptTokenizerOperation: stores both flags and builds a UnicodeScriptTokenizerOp from them.
+UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)
+    : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
+
+Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); }  // Always returns Status::OK().
+
+std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() {  // Creates the tensor op from the stored flags.
+  std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op =
+    std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_);  // Argument order: whitespace, offsets.
+  return tensor_op;  // Converted to std::shared_ptr<TensorOp> on return.
+}
+
 // WhitespaceTokenizerOperation
 WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}

--- a/mindspore/ccsrc/minddata/dataset/core/constants.h
+++ b/mindspore/ccsrc/minddata/dataset/core/constants.h
@ -59,6 +59,15 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
 // Possible values for SPieceTokenizerLoadType
 enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };

+// Possible values for NormalizeForm (Unicode normalization forms, see http://unicode.org/reports/tr15/).
+enum class NormalizeForm {
+  kNone = 0,  // Do nothing for the input string tensor.
+  kNfc,       // Normalize with Normalization Form C.
+  kNfkc,      // Normalize with Normalization Form KC.
+  kNfd,       // Normalize with Normalization Form D.
+  kNfkd,      // Normalize with Normalization Form KD.
+};
+
 // convenience functions for 32bit int bitmask
 inline bool BitTest(uint32_t bits, uint32_t bitMask) { return (bits & bitMask) == bitMask; }

--- a/mindspore/ccsrc/minddata/dataset/include/text.h
+++ b/mindspore/ccsrc/minddata/dataset/include/text.h
@ -38,23 +38,41 @@ namespace dataset {
 namespace text {

 // Char arrays storing name of corresponding classes (in alphabetical order)
+constexpr char kCaseFoldOperation[] = "CaseFold";
 constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
 constexpr char kLookupOperation[] = "Lookup";
 constexpr char kNgramOperation[] = "Ngram";
+constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
 constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
 constexpr char kSlidingWindowOperation[] = "SlidingWindow";
+constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
+constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
 constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";

 // Text Op classes (in alphabetical order)
+#ifndef _WIN32
+class CaseFoldOperation;
+#endif
 class JiebaTokenizerOperation;
 class LookupOperation;
 class NgramOperation;
+#ifndef _WIN32
+class NormalizeUTF8Operation;
+#endif
 class SentencePieceTokenizerOperation;
 class SlidingWindowOperation;
+class UnicodeCharTokenizerOperation;
 #ifndef _WIN32
+class UnicodeScriptTokenizerOperation;
 class WhitespaceTokenizerOperation;
 #endif

+#ifndef _WIN32
+/// \brief Apply case fold operation on UTF-8 string tensor.
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<CaseFoldOperation> CaseFold();
+#endif
+
 /// \brief Tokenize Chinese string into words based on dictionary.
 /// \param[in] hmm_path Dictionary file is used by HMMSegment algorithm. The dictionary can be obtained on the
 ///   official website of cppjieba.
@ -94,6 +112,21 @@ std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
                                      const std::pair<std::string, int32_t> &right_pad = {"", 0},
                                      const std::string &separator = " ");

+#ifndef _WIN32
+/// \brief Apply normalize operation on UTF-8 string tensor.
+/// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
+///   NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd] (default=NormalizeForm::kNfkc).
+///   See http://unicode.org/reports/tr15/ for details.
+///   - NormalizeForm::kNone, do nothing for input string tensor.
+///   - NormalizeForm::kNfc, normalize with Normalization Form C.
+///   - NormalizeForm::kNfkc, normalize with Normalization Form KC.
+///   - NormalizeForm::kNfd, normalize with Normalization Form D.
+///   - NormalizeForm::kNfkd, normalize with Normalization Form KD.
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);
+#endif
+
 /// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
 /// \param[in] vocab a SentencePieceVocab object.
 /// \param[in] out_type The type of output.
@ -116,8 +149,20 @@ std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis = 0);

+/// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
+/// \param[in] with_offsets Whether to output offsets of tokens (default=false).
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<UnicodeCharTokenizerOperation> UnicodeCharTokenizer(bool with_offsets = false);
+
 #ifndef _WIN32
-/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces
+/// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
+/// \param[in] keep_whitespace Whether to emit whitespace tokens (default=false).
+/// \param[in] with_offsets Whether to output offsets of tokens (default=false).
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool keep_whitespace = false,
+                                                                        bool with_offsets = false);
+
+/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces.
 /// \param[in] with_offsets If or not output offsets of tokens (default=false).
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
@ -125,6 +170,21 @@ std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offs

 /* ####################################### Derived TensorOperation classes ################################# */

+#ifndef _WIN32
+class CaseFoldOperation : public TensorOperation {  // TensorOperation for case folding; has no data members.
+ public:
+  CaseFoldOperation() = default;
+
+  ~CaseFoldOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;  // Returns the TensorOp built for this operation.
+
+  Status ValidateParams() override;  // Returns a Status; this operation has no parameters of its own.
+
+  std::string Name() const override { return kCaseFoldOperation; }  // Returns the string "CaseFold".
+};
+#endif
+
 class JiebaTokenizerOperation : public TensorOperation {
 public:
  explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
@ -185,6 +245,24 @@ class NgramOperation : public TensorOperation {
  std::string separator_;
 };

+#ifndef _WIN32
+class NormalizeUTF8Operation : public TensorOperation {  // TensorOperation for UTF-8 string normalization.
+ public:
+  explicit NormalizeUTF8Operation(NormalizeForm normalize_form);  // normalize_form selects the normalization form.
+
+  ~NormalizeUTF8Operation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;  // Returns the TensorOp built for this operation.
+
+  Status ValidateParams() override;  // Returns a Status for the stored parameters.
+
+  std::string Name() const override { return kNormalizeUTF8Operation; }  // Returns the string "NormalizeUTF8".
+
+ private:
+  NormalizeForm normalize_form_;  // Normalization form given to the constructor.
+};
+#endif
+
 class SentencePieceTokenizerOperation : public TensorOperation {
 public:
  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);
@ -223,7 +301,40 @@ class SlidingWindowOperation : public TensorOperation {
  int32_t axis_;
 };

+class UnicodeCharTokenizerOperation : public TensorOperation {  // TensorOperation for Unicode character tokenization.
+ public:
+  explicit UnicodeCharTokenizerOperation(bool with_offsets);  // with_offsets: whether to output token offsets.
+
+  ~UnicodeCharTokenizerOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;  // Returns the TensorOp built for this operation.
+
+  Status ValidateParams() override;  // Returns a Status for the stored parameters.
+
+  std::string Name() const override { return kUnicodeCharTokenizerOperation; }  // Returns "UnicodeCharTokenizer".
+
+ private:
+  bool with_offsets_;  // Flag given to the constructor.
+};
+
 #ifndef _WIN32
+class UnicodeScriptTokenizerOperation : public TensorOperation {  // TensorOperation for Unicode script tokenization.
+ public:
+  explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets);  // Flags are stored as given.
+
+  ~UnicodeScriptTokenizerOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;  // Returns the TensorOp built for this operation.
+
+  Status ValidateParams() override;  // Returns a Status for the stored parameters.
+
+  std::string Name() const override { return kUnicodeScriptTokenizerOperation; }  // Returns "UnicodeScriptTokenizer".
+
+ private:
+  bool keep_whitespace_;  // Whether to emit whitespace tokens.
+  bool with_offsets_;     // Whether to output token offsets.
+};
+
 class WhitespaceTokenizerOperation : public TensorOperation {
 public:
  explicit WhitespaceTokenizerOperation(bool with_offsets);
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/normalize_utf8_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/normalize_utf8_op.h
@ -24,13 +24,6 @@

 namespace mindspore {
 namespace dataset {
-enum class NormalizeForm {
-  kNone = 0,
-  kNfc,
-  kNfkc,
-  kNfd,
-  kNfkd,
-};

 class NormalizeUTF8Op : public TensorOp {
 public:
--- a/tests/ut/cpp/dataset/c_api_text_test.cc
+++ b/tests/ut/cpp/dataset/c_api_text_test.cc