Add WhitespaceTokenizer and UnicodeScriptTokenizer for NLP

add CaseFold, NormalizeUTF8

add RegexReplace

add RegexTokenizer

add BasicTokenizer

add WordpieceTokenizer

add BertTokenizer
pull/2092/head
qianlong 5 years ago
parent ea37dc76f0
commit 4f16f036be

File diff suppressed because it is too large Load Diff

@ -0,0 +1,19 @@
# Build and import the ICU4C (International Components for Unicode) third-party
# libraries required by the text kernels (tokenizers, normalizers, regex ops).
set(LIB_ICU_COMMON icuuc)
set(LIB_ICU_DATA icudata)
set(LIB_ICU_I18N icui18n)
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
    # ICU4C is only wired up for Linux-style builds here; the text ops that
    # need it are compiled out elsewhere via the ENABLE_ICU4C define.
    message("icu4c thirdparty do not support windows currently.")
else()
    # Download, configure and build ICU 67.1 from the upstream release tarball.
    # The data filter file trims the bundled ICU data to what is actually used.
    mindspore_add_pkg(icu4c
            VER 67.1
            LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
            URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
            MD5 0c2662a2b0bc80b0eb56495205247c8f
            CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-tests=no --enable-samples=no --enable-icuio=no --enable-extras=no ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
            )
    include_directories(${icu4c_INC})
    # Namespaced aliases so targets can link mindspore::icuuc etc. uniformly.
    add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
    add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
    add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})
    # Enables the ICU-dependent code paths guarded by #ifdef ENABLE_ICU4C.
    add_definitions(-D ENABLE_ICU4C)
endif()

@ -54,6 +54,7 @@ elseif(ENABLE_D OR ENABLE_TESTCASES)
endif()
if (ENABLE_MINDDATA)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/icu4c.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/jpeg_turbo.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/libtiff.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/opencv.cmake)

@ -91,7 +91,20 @@ if (ENABLE_MINDDATA)
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
    # ICU4C is not built on Windows (see cmake/external_libs/icu4c.cmake),
    # so there are no ICU libraries to install there.
    message("icu4c does not support windows system temporarily")
else()
    # Collect every shared-library artifact of the three ICU components
    # (common, data, i18n); the trailing wildcard also matches versioned names.
    file(GLOB_RECURSE ICU4C_LIB_LIST
            ${icu4c_LIBPATH}/libicuuc*
            ${icu4c_LIBPATH}/libicudata*
            ${icu4c_LIBPATH}/libicui18n*
            )
    # Ship the ICU runtime libraries with the mindspore package.
    install(
            FILES ${ICU4C_LIB_LIST}
            DESTINATION ${INSTALL_LIB_DIR}
            COMPONENT mindspore
            )
endif()
endif ()
if (ENABLE_CPU)

@ -108,10 +108,11 @@ target_link_libraries(_c_dataengine PRIVATE mindspore mindspore_gvar)
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module ${PYTHON_LIBRARIES} mindspore::protobuf ${SECUREC_LIBRARY})
else()
set(ICU_LIB mindspore::icuuc mindspore::icudata mindspore::icui18n)
target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module -ldl mindspore::protobuf ${SECUREC_LIBRARY})
endif()
target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs
mindspore::opencv_imgproc mindspore::tinyxml2)
mindspore::opencv_imgproc mindspore::tinyxml2 ${ICU_LIB})
if (ENABLE_GPUQUE)
target_link_libraries(_c_dataengine PRIVATE gpu_queue
${CUDNN_PATH}/lib64/libcudnn.so

@ -65,8 +65,21 @@
#include "dataset/text/kernels/jieba_tokenizer_op.h"
#include "dataset/text/kernels/ngram_op.h"
#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/text/vocab.h"
#include "dataset/text/kernels/lookup_op.h"
#ifdef ENABLE_ICU4C
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/bert_tokenizer_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "dataset/util/random.h"
#include "mindrecord/include/shard_operator.h"
#include "mindrecord/include/shard_pk_sample.h"
@ -485,7 +498,7 @@ void bindTensorOps4(py::module *m) {
py::arg("fillR") = PadOp::kDefFillR, py::arg("fillG") = PadOp::kDefFillG, py::arg("fillB") = PadOp::kDefFillB);
}
void bindTensorOps5(py::module *m) {
void bindTokenizerOps(py::module *m) {
(void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
.def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
py::arg("mode") = JiebaMode::kMix)
@ -503,6 +516,55 @@ void bindTensorOps5(py::module *m) {
const std::string &>(),
py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"), py::arg("r_pad_token"),
py::arg("separator"));
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
*m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
}
// Registers the tokenizer TensorOps that depend on ICU4C with the Python
// module. On builds without ICU (ENABLE_ICU4C undefined, e.g. Windows) this
// function is an intentional no-op so the module still loads.
void bindDependIcuTokenizerOps(py::module *m) {
#ifdef ENABLE_ICU4C
  // Whitespace / script-boundary tokenizers.
  (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
    *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
    .def(py::init<>());
  (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
    *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
    .def(py::init<>())
    .def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
  // String-normalization ops.
  (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
    *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
    .def(py::init<>());
  (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(
    *m, "NormalizeUTF8Op", "Apply normalize operation on utf-8 string tensor.")
    .def(py::init<>())
    .def(py::init<NormalizeForm>(), py::arg("normalize_form") = NormalizeUTF8Op::kDefNormalizeForm);
  // Regex-based replace / tokenize ops.
  (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(
    *m, "RegexReplaceOp", "Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.")
    .def(py::init<const std::string &, const std::string &, bool>(), py::arg("pattern"), py::arg("replace"),
         py::arg("replace_all"));
  (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
    *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
    .def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"), py::arg("keep_delim_pattern"));
  // BERT-style tokenizers built on the ops above.
  (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
    *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
    .def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
  // BertTokenizerOp combines basic + wordpiece tokenization; its defaults come
  // from WordpieceTokenizerOp and BasicTokenizerOp respectively.
  (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp",
                                                                                "Tokenizer used for Bert text process.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
                  NormalizeForm, bool>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
#endif
}
void bindSamplerOps(py::module *m) {
@ -715,6 +777,16 @@ PYBIND11_MODULE(_c_dataengine, m) {
.value("DE_JIEBA_HMM", JiebaMode::kHmm)
.export_values();
#ifdef ENABLE_ICU4C
(void)py::enum_<NormalizeForm>(m, "NormalizeForm", py::arithmetic())
.value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
.value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
.value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
.value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
.value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
.export_values();
#endif
(void)py::enum_<InterpolationMode>(m, "InterpolationMode", py::arithmetic())
.value("DE_INTER_LINEAR", InterpolationMode::kLinear)
.value("DE_INTER_CUBIC", InterpolationMode::kCubic)
@ -734,12 +806,13 @@ PYBIND11_MODULE(_c_dataengine, m) {
bindTensorOps2(&m);
bindTensorOps3(&m);
bindTensorOps4(&m);
bindTensorOps5(&m);
bindTokenizerOps(&m);
bindSamplerOps(&m);
bindDatasetOps(&m);
bindInfoObjects(&m);
bindVocabObjects(&m);
bindGraphData(&m);
bindDependIcuTokenizerOps(&m);
}
} // namespace dataset
} // namespace mindspore

@ -1,8 +1,21 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
# The ICU-backed kernels are only compiled on non-Windows platforms (ICU4C is
# not built on Windows; see cmake/external_libs/icu4c.cmake). On Windows,
# ICU_DEPEND_FILES stays undefined and expands to nothing below.
if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows"))
    set(ICU_DEPEND_FILES
            basic_tokenizer_op.cc
            bert_tokenizer_op.cc
            case_fold_op.cc
            normalize_utf8_op.cc
            regex_replace_op.cc
            regex_tokenizer_op.cc
            unicode_script_tokenizer_op.cc
            whitespace_tokenizer_op.cc)
endif()
add_library(text-kernels OBJECT
        lookup_op.cc
        jieba_tokenizer_op.cc
        unicode_char_tokenizer_op.cc
        ngram_op.cc
        wordpiece_tokenizer_op.cc
        ${ICU_DEPEND_FILES}
        )

@ -0,0 +1,93 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
// Default values for the BasicTokenizerOp constructor arguments.
const bool BasicTokenizerOp::kDefLowerCase = false;
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
// Regex alternation of characters that act as token delimiters: the four ASCII
// punctuation runs, the Unicode punctuation category, and the CJK ideograph
// blocks (so each CJK character becomes its own token).
const char BasicTokenizerOp::kCommonPattern[] =
  "[!-/]"                     // ASCII punctuation ! " # $ % & ' ( ) * + , - . /
  "|[:-@]"                    // ASCII punctuation : ; < = > ? @
  "|[\\[-`]"                  // ASCII punctuation [ \ ] ^ _ `
  "|[{-~]"                    // ASCII punctuation { | } ~
  "|[\\p{P}]"                 // any Unicode punctuation
  "|[\\x{4E00}-\\x{9FFF}]"    // CJK Unified Ideographs
  "|[\\x{3400}-\\x{4DBF}]"    // CJK Unified Ideographs Extension A
  "|[\\x{20000}-\\x{2A6DF}]"  // CJK Unified Ideographs Extension B
  "|[\\x{2A700}-\\x{2B73F}]"  // CJK Unified Ideographs Extension C
  "|[\\x{2B740}-\\x{2B81F}]"  // CJK Unified Ideographs Extension D
  "|[\\x{2B820}-\\x{2CEAF}]"  // CJK Unified Ideographs Extension E
  "|[\\x{F900}-\\x{FAFF}]"    // CJK Compatibility Ideographs
  "|[\\x{2F800}-\\x{2FA1F}]"; // CJK Compatibility Ideographs Supplement
// Special BERT tokens that may be preserved intact. Note the trailing '|':
// this pattern is concatenated in FRONT of other patterns in the constructor,
// so the trailing pipe joins the alternations.
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|";
// Builds the fixed op pipeline used by Compute():
//   case folding, NFD / configurable normalization, accent and control
//   character removal, and the final regex tokenizer whose delimiter patterns
//   are assembled here from kCommonPattern / kUnusedPattern.
//
// @param lower_case            fold case and strip accents instead of applying
//                              normalization_form (see Compute()).
// @param keep_whitespace       emit whitespace runs as tokens.
// @param normalization_form    Unicode normalization used when lower_case is false.
// @param preserve_unused_token keep [CLS]/[SEP]/[UNK]/[PAD]/[MASK] intact.
BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
                                   bool preserve_unused_token)
    : lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalization_form_(normalization_form),  // fix: member was previously left uninitialized
      preserve_unused_token_(preserve_unused_token),
      case_fold_(std::make_unique<CaseFoldOp>()),
      nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
      common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
      replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
      replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
  // Split on whitespace plus the common punctuation/CJK pattern.
  std::string delim_pattern = std::string("\\s+|") + kCommonPattern;
  std::string keep_delim_pattern;
  if (keep_whitespace_) {
    // Keep every delimiter (including whitespace) as a token of its own.
    keep_delim_pattern = delim_pattern;
  } else {
    // Keep punctuation/CJK delimiters but drop the whitespace ones.
    keep_delim_pattern = kCommonPattern;
  }
  if (preserve_unused_token_) {
    // Prepend the special-token alternation (it carries a trailing '|').
    keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
    delim_pattern = kUnusedPattern + delim_pattern;
  }
  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
}
// Tokenizes a scalar UTF-8 string tensor.
//
// Pipeline (ops are built in the constructor):
//   lower_case_ == true : case fold -> NFD normalize -> strip accents (\p{Mn})
//   lower_case_ == false: apply the configured normalization form
// then, in both cases: replace control/format chars (\p{Cc}|\p{Cf}) with a
// space and run the regex tokenizer, whose result becomes *output.
Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  // Only rank-0 (scalar) string tensors are accepted.
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
  }
  // cur_input/processed_tensor alternate as source/destination through the
  // chain of sub-ops; the order of these calls is significant.
  std::shared_ptr<Tensor> cur_input;
  std::shared_ptr<Tensor> processed_tensor;
  if (lower_case_) {
    // to lower case
    RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
    cur_input = processed_tensor;
    // strip accent characters: NFD decomposition exposes combining marks,
    // which replace_accent_chars_ then removes.
    RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor));
    cur_input = processed_tensor;
    RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
  } else {
    RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
  }
  // strip control characters
  cur_input = processed_tensor;
  RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
  return regex_tokenizer_->Compute(processed_tensor, output);
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,64 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp performing BERT-style "basic" tokenization on a scalar UTF-8
// string tensor: optional case folding + accent stripping (or a configurable
// Unicode normalization), control-character cleanup, then regex splitting on
// whitespace/punctuation/CJK characters (see basic_tokenizer_op.cc).
class BasicTokenizerOp : public TensorOp {
 public:
  // Defaults for the constructor arguments (defined in the .cc file).
  static const bool kDefLowerCase;
  static const bool kDefKeepWhitespace;
  static const NormalizeForm kDefNormalizationForm;
  static const bool kDefPreserveUnusedToken;

  // @param lower_case            fold case and strip accents; normalization_form
  //                              is not applied on this path (see Compute()).
  // @param keep_whitespace       emit whitespace runs as tokens.
  // @param normalization_form    normalization used when lower_case is false.
  // @param preserve_unused_token keep [CLS]/[SEP]/[UNK]/[PAD]/[MASK] intact.
  BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
                   NormalizeForm normalization_form = kDefNormalizationForm,
                   bool preserve_unused_token = kDefPreserveUnusedToken);

  ~BasicTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }

  // Tokenizes a scalar string tensor; errors on non-scalar/non-string input.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  static const char kCommonPattern[];  // punctuation + CJK delimiter pattern
  static const char kUnusedPattern[];  // special BERT tokens to preserve
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalization_form_;
  bool preserve_unused_token_;
  // Sub-ops composed by Compute(), constructed once in the constructor.
  std::unique_ptr<CaseFoldOp> case_fold_;
  std::unique_ptr<NormalizeUTF8Op> nfd_normalize_;
  std::unique_ptr<NormalizeUTF8Op> common_normalize_;
  std::unique_ptr<RegexReplaceOp> replace_accent_chars_;
  std::unique_ptr<RegexReplaceOp> replace_control_chars_;
  std::unique_ptr<RegexTokenizerOp> regex_tokenizer_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_

@ -0,0 +1,27 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/bert_tokenizer_op.h"
namespace mindspore {
namespace dataset {
// Runs the two-stage BERT tokenization: basic tokenization first, then
// wordpiece sub-token lookup on the basic tokens.
Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  // Stage 1: normalization + splitting into basic tokens.
  std::shared_ptr<Tensor> basic_tokens;
  RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tokens));
  // Stage 2: wordpiece segmentation of the basic tokens into *output.
  RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tokens, output));
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp implementing full BERT tokenization: a BasicTokenizerOp pass
// followed by a WordpieceTokenizerOp pass (see bert_tokenizer_op.cc).
class BertTokenizerOp : public TensorOp {
 public:
  // The first four parameters are forwarded to the wrapped WordpieceTokenizerOp,
  // the last four to the wrapped BasicTokenizerOp; defaults mirror those ops.
  BertTokenizerOp(const std::shared_ptr<Vocab> &vocab,
                  const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
                  const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
                  const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
                  bool lower_case = BasicTokenizerOp::kDefLowerCase,
                  bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
                  NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
                  bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
      : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
        basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}

  ~BertTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }

  // Applies basic_tokenizer_ then wordpiece_tokenizer_ to `input`.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  WordpieceTokenizerOp wordpiece_tokenizer_;
  BasicTokenizerOp basic_tokenizer_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_

@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/case_fold_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/utypes.h"
namespace mindspore {
namespace dataset {
// Applies ICU NFKC case folding to every element of the input string tensor;
// the output tensor keeps the input's shape.
Status CaseFoldOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  icu::ErrorCode icu_status;
  // Singleton ICU instance; not owned, must not be freed.
  const icu::Normalizer2 *case_folder = icu::Normalizer2::getNFKCCasefoldInstance(icu_status);
  CHECK_FAIL_RETURN_UNEXPECTED(icu_status.isSuccess(), "getNFKCCasefoldInstance failed.");
  std::vector<std::string> folded(input->Size());
  size_t idx = 0;
  for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
    const std::string_view &element = *it;
    // The byte sink writes the folded UTF-8 directly into the result slot.
    icu::StringByteSink<std::string> sink(&folded[idx++]);
    case_folder->normalizeUTF8(0, icu::StringPiece(element.data(), element.size()), sink, nullptr, icu_status);
    CHECK_FAIL_RETURN_UNEXPECTED(icu_status.isSuccess(), "normalizeUTF8 failed.");
  }
  *output = std::make_shared<Tensor>(std::move(folded), input->shape());
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,39 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
#define DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp that applies ICU NFKC case folding to every element of a UTF-8
// string tensor (see case_fold_op.cc).
class CaseFoldOp : public TensorOp {
 public:
  CaseFoldOp() {}
  ~CaseFoldOp() override = default;
  void Print(std::ostream &out) const override { out << "CaseFoldOp"; }
  // Case-folds each string element; writes a same-shape tensor to *output.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_

@ -29,6 +29,7 @@ JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::strin
}
Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {

@ -24,6 +24,7 @@ LookupOp::LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id)
: vocab_(vocab), default_id_(default_id), type_(DataType("int32")) {}
Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
RETURN_UNEXPECTED_IF_NULL(vocab_);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor");
std::vector<WordIdType> word_ids;

@ -34,6 +34,7 @@ NgramOp::NgramOp(const std::vector<int32_t> &ngrams, int32_t l_len, int32_t r_le
separator_(separator) {}
Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING && input->Rank() == 1, "Not a 1-D str Tensor");
std::vector<int32_t> offsets; // offsets for each str
std::vector<std::string> res; // holds the result of ngrams

@ -0,0 +1,75 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/normalize_utf8_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/utypes.h"
namespace mindspore {
namespace dataset {
// Default normalization form used when none is given at construction.
const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;

// Normalizes every element of a UTF-8 string tensor with the configured
// Unicode normalization form; the result keeps the input's shape.
Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  // kNone is the identity transform: pass the input tensor through untouched.
  if (normalize_form_ == NormalizeForm::kNone) {
    *output = input;
    return Status::OK();
  }
  icu::ErrorCode error;
  // Select the matching ICU singleton normalizer (not owned, must not be freed).
  const icu::Normalizer2 *normalizer = nullptr;
  if (normalize_form_ == NormalizeForm::kNfc) {
    normalizer = icu::Normalizer2::getNFCInstance(error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFCInstance failed");
  } else if (normalize_form_ == NormalizeForm::kNfkc) {
    normalizer = icu::Normalizer2::getNFKCInstance(error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCInstance failed");
  } else if (normalize_form_ == NormalizeForm::kNfd) {
    normalizer = icu::Normalizer2::getNFDInstance(error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFDInstance failed");
  } else if (normalize_form_ == NormalizeForm::kNfkd) {
    normalizer = icu::Normalizer2::getNFKDInstance(error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKDInstance failed");
  } else {
    RETURN_STATUS_UNEXPECTED("unexpected normalize form");
  }
  std::vector<std::string> normalized(input->Size());
  size_t out_idx = 0;
  for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
    // The byte sink writes the normalized UTF-8 directly into the result slot.
    icu::StringByteSink<std::string> sink(&normalized[out_idx++]);
    normalizer->normalizeUTF8(0, icu::StringPiece((*it).data(), (*it).size()), sink, nullptr, error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
  }
  *output = std::make_shared<Tensor>(std::move(normalized), input->shape());
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,50 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
#define DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// Unicode normalization forms supported by NormalizeUTF8Op. Each value maps
// to the corresponding ICU Normalizer2 instance in normalize_utf8_op.cc;
// kNone means "no normalization" (the input passes through unchanged).
enum class NormalizeForm {
  kNone = 0,
  kNfc,
  kNfkc,
  kNfd,
  kNfkd,
};
// TensorOp that applies a Unicode normalization form to every element of a
// UTF-8 string tensor (see normalize_utf8_op.cc).
class NormalizeUTF8Op : public TensorOp {
 public:
  // Default form (defined in the .cc file).
  static const NormalizeForm kDefNormalizeForm;
  explicit NormalizeUTF8Op(NormalizeForm normalize_form = kDefNormalizeForm) : normalize_form_(normalize_form) {}
  ~NormalizeUTF8Op() override = default;
  void Print(std::ostream &out) const override { out << "NormalizeUTF8Op"; }
  // Normalizes each string element; writes a same-shape tensor to *output.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  NormalizeForm normalize_form_;  // which normalization to apply in Compute()
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_

@ -0,0 +1,57 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/regex_replace_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
// Applies the regex replacement to a single UTF-8 string.
//
// @param matcher pre-built ICU matcher for pattern_, reset here onto `text`.
// @param text    UTF-8 input string (viewed, not owned).
// @param out     receives the UTF-8 result; must be non-null.
// @return error status if matcher/out is null or ICU reports a failure.
Status RegexReplaceOp::RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text,
                                    std::string *out) const {
  CHECK_FAIL_RETURN_UNEXPECTED((matcher != nullptr && out != nullptr), "Input is null");
  UErrorCode icu_error = U_ZERO_ERROR;
  // ICU regex operates on UTF-16 UnicodeString, so convert first.
  icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text);
  matcher->reset(unicode_text);
  icu::UnicodeString unicode_out;
  if (replace_all_) {
    unicode_out = matcher->replaceAll(replace_, icu_error);
  } else {
    unicode_out = matcher->replaceFirst(replace_, icu_error);
  }
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "RegexReplace failed");
  // Convert the UTF-16 result back to UTF-8 into *out.
  unicode_out.toUTF8String(*out);
  return Status::OK();
}
// Applies the regex replacement to every element of the input string tensor.
//
// @param input  string tensor; each element is processed independently.
// @param output receives a new string tensor with the same shape.
// @return error status if the pattern fails to compile or a replacement fails.
Status RegexReplaceOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  UErrorCode icu_error = U_ZERO_ERROR;
  // Build the matcher once from pattern_ and reuse it (reset) per element.
  icu::RegexMatcher matcher(pattern_, 0, icu_error);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "Create icu RegexMatcher failed, you may input one error pattern");
  std::vector<std::string> strs(input->Size());
  int i = 0;
  for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
    // Bug fix: advance the output index per element. Previously `strs[i]` was
    // used without incrementing i, so every result overwrote strs[0] and all
    // other output slots stayed empty.
    RETURN_IF_NOT_OK(RegexReplace(&matcher, *iter, &strs[i++]));
  }
  *output = std::make_shared<Tensor>(std::move(strs), input->shape());
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,55 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
#define DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
#include <memory>
#include <string>
#include "unicode/regex.h"
#include "unicode/errorcode.h"
#include "unicode/utypes.h"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp that performs ICU regular-expression replacement on every element
// of a UTF-8 string tensor (see regex_replace_op.cc).
class RegexReplaceOp : public TensorOp {
 public:
  // @param pattern     ICU regex pattern to search for (stored as UTF-16).
  // @param replace     replacement text.
  // @param replace_all true: replace every match; false: only the first one.
  RegexReplaceOp(const std::string &pattern, const std::string &replace, bool replace_all = true)
      : pattern_(icu::UnicodeString::fromUTF8(pattern)),
        replace_(icu::UnicodeString::fromUTF8(replace)),
        replace_all_(replace_all) {}

  ~RegexReplaceOp() override = default;

  void Print(std::ostream &out) const override { out << "RegexReplaceOp"; }

  // Replaces matches in each string element; writes a same-shape tensor to *output.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 protected:
  // Replaces matches in one string; `matcher` is reset onto `text` here.
  Status RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, std::string *out) const;

 private:
  const icu::UnicodeString pattern_;
  const icu::UnicodeString replace_;
  const bool replace_all_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_

@ -0,0 +1,103 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
// Extracts the code-unit range [start, start + len) from `input` and writes it
// to the UTF-8 and/or UnicodeString destinations (either may be null, not both).
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
                                          icu::UnicodeString *out_unicode) const {
  // At least one destination must be supplied.
  CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
  // The requested range must be non-empty and lie inside the string.
  const int total_len = input.length();
  const int end = start + len;
  CHECK_FAIL_RETURN_UNEXPECTED((start >= 0 && len > 0 && end <= total_len), "Out of range");
  icu::UnicodeString substring;
  input.extract(start, len, substring);
  if (out_utf8 != nullptr) {
    // Convert the extracted UTF-16 run to UTF-8 into *out_utf8.
    substring.toUTF8String(*out_utf8);
  }
  if (out_unicode != nullptr) {
    *out_unicode = substring;
  }
  return Status::OK();
}
// Splits `text` into tokens, using delim_pattern_ as the separator regex.
// When keep_delim_ is true, a matched delimiter is itself emitted as a token
// if it fully matches keep_delim_pattern_.
// @param text - UTF-8 input string.
// @param out_tokens - receives the resulting UTF-8 tokens (cleared first).
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
  UErrorCode status = U_ZERO_ERROR;
  out_tokens->clear();
  icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");
  icu::RegexMatcher delim_matcher(keep_delim_pattern_, 0, status);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");
  icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
  token_matcher.reset(utext);
  int token_start_index = 0;
  status = U_ZERO_ERROR;
  while (token_matcher.find(status) && U_SUCCESS(status)) {
    int deli_start_index = token_matcher.start(status);
    CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed");
    int deli_end_index = token_matcher.end(status);
    // Fixed copy-paste: this failure is about the matched END index, not the start index.
    CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched end index failed");
    // Emit the (non-empty) text between the previous delimiter and this one.
    int token_len = deli_start_index - token_start_index;
    if (token_len > 0) {
      std::string token;
      RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
      out_tokens->emplace_back(std::move(token));
    }
    int delim_len = deli_end_index - deli_start_index;
    // Optionally keep the delimiter itself, but only if it fully matches keep_delim_pattern_.
    if (keep_delim_ && delim_len > 0) {
      icu::UnicodeString delim_str;
      std::string delim_utf8_str;
      RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
      delim_matcher.reset(delim_str);
      if (delim_matcher.matches(status) && U_SUCCESS(status)) {
        out_tokens->emplace_back(std::move(delim_utf8_str));
      }
    }
    token_start_index = deli_end_index;
  }
  // Flush any trailing text after the last delimiter.
  if (token_start_index < utext.length()) {
    std::string temp;
    RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
    out_tokens->emplace_back(std::move(temp));
  }
  return Status::OK();
}
// Tokenizes a scalar string tensor into a 1-D string tensor of tokens.
// @param input - scalar tensor of type DE_STRING.
// @param output - receives a 1-D DE_STRING tensor, one element per token.
Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
  }
  std::string_view text;
  RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
  std::vector<std::string> tokens;
  RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
  // Capture the shape before handing `tokens` to the Tensor constructor so the
  // result does not depend on argument-evaluation order relative to std::move.
  TensorShape out_shape({static_cast<dsize_t>(tokens.size())});
  *output = std::make_shared<Tensor>(std::move(tokens), out_shape);
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

@ -0,0 +1,58 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_REGEX_TOKENIZER_OP_H_
#define DATASET_TEXT_REGEX_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include <vector>
#include "unicode/regex.h"
#include "unicode/errorcode.h"
#include "unicode/utypes.h"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
// TensorOp that splits a scalar string tensor into tokens using an ICU regex
// as the delimiter, optionally keeping the delimiters themselves as tokens.
class RegexTokenizerOp : public TensorOp {
 public:
  // Constructor.
  // @param delim_pattern - UTF-8 regex that matches token delimiters.
  // @param keep_delim_pattern - UTF-8 regex; a matched delimiter is kept as a
  //        token only if it matches this pattern. An empty string disables
  //        keeping delimiters entirely (see keep_delim_ below).
  RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
    : delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
      keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
      keep_delim_(!keep_delim_pattern.empty()) {}
  ~RegexTokenizerOp() override = default;
  void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
  // Splits the input scalar string tensor into a 1-D tensor of tokens.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
 protected:
  // Extracts code units [start, start + len) of `input` into *out_utf8 (UTF-8)
  // and/or *out_unicode; at least one output pointer must be non-null.
  Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
                          icu::UnicodeString *out_unicode = nullptr) const;
  // Performs the actual regex-based split of `text` into *out_tokens.
  Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
 private:
  const icu::UnicodeString delim_pattern_;       // delimiter regex (UTF-16)
  const icu::UnicodeString keep_delim_pattern_;  // filter for delimiters to keep (UTF-16)
  const bool keep_delim_;                        // true iff keep_delim_pattern was non-empty
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_REGEX_TOKENIZER_OP_H_

@ -28,6 +28,7 @@ namespace mindspore {
namespace dataset {
Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#define DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#ifndef DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
@ -37,4 +37,4 @@ class UnicodeCharTokenizerOp : public TensorOp {
} // namespace dataset
} // namespace mindspore
#endif // DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#endif // DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_

@ -0,0 +1,93 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "cppjieba/Unicode.hpp"
#include "unicode/errorcode.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
using cppjieba::DecodeRunesInString;
using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
// Default: whitespace-only segments are dropped from the tokenizer output.
const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
// Splits a scalar string tensor into tokens at Unicode-script boundaries and
// at whitespace/non-whitespace boundaries, producing a 1-D string tensor.
// @param input - scalar tensor of type DE_STRING.
// @param output - receives a 1-D DE_STRING tensor of the resulting segments.
Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
  }
  std::string_view str;
  RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
  RuneStrArray runes;
  if (!DecodeRunesInString(str.data(), str.size(), runes)) {
    RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
  }
  UScriptCode last_script = USCRIPT_INVALID_CODE;
  icu::ErrorCode status;
  int start = 0;  // byte offset of the current segment in `str`
  int len = 0;    // byte length of the current segment
  std::vector<std::string> splits;
  bool was_space = false;
  for (size_t i = 0; i < runes.size(); i++) {
    bool is_space = u_isUWhiteSpace(runes[i].rune);
    UScriptCode script = uscript_getScript(runes[i].rune, status);
    if (status.isFailure()) {
      status.reset();
      script = USCRIPT_INVALID_CODE;
    }
    // 1) Separate UTF-8 strings of different UScriptCode values
    //    (such as: "Chinese中国" should be split to ["Chinese", "中国"])
    // 2) Separate whitespace and non-whitespace UTF-8 strings
    //    (such as: " ." should be split to [" ", "."])
    if (len > 0 && (script != last_script || is_space != was_space)) {
      // 3) If keep_whitespace_ is false, all the whitespace characters will be discarded
      if (keep_whitespace_ || !was_space) {
        std::string temp(str.substr(start, len));
        splits.emplace_back(std::move(temp));
      }
      start = runes[i].offset;
      len = runes[i].len;
    } else {
      len += runes[i].len;
    }
    last_script = script;
    was_space = is_space;
  }
  // Flush the trailing segment, subject to the same whitespace filter.
  if (len > 0 && (keep_whitespace_ || !was_space)) {
    std::string temp(str.substr(start, len));
    splits.emplace_back(std::move(temp));
  }
  // 4) If the input is empty scalar string, the output will be 1-D empty string.
  if (splits.empty()) {
    splits.emplace_back("");
  }
  // Capture the shape first, then move `splits` into the Tensor (the original
  // copied the vector; moving matches RegexTokenizerOp::Compute and avoids it).
  TensorShape out_shape({static_cast<dsize_t>(splits.size())});
  *output = std::make_shared<Tensor>(std::move(splits), out_shape);
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save