Add CaseFold, NormalizeUTF8, RegexReplace, RegexTokenizer, BasicTokenizer, WordpieceTokenizer and BertTokenizer (pull/2092/head)
parent
ea37dc76f0
commit
4f16f036be
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,19 @@
|
||||
# Names of the ICU4C libraries requested from the icu4c third-party package.
set(LIB_ICU_COMMON icuuc)
set(LIB_ICU_DATA icudata)
set(LIB_ICU_I18N icui18n)

if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows"))
    # Fetch and configure ICU4C 67.1 with tests, samples, icuio and extras switched off.
    # The repository's filter.json is handed to the ICU build as ICU_DATA_FILTER_FILE.
    mindspore_add_pkg(icu4c
            VER 67.1
            LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
            URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
            MD5 0c2662a2b0bc80b0eb56495205247c8f
            CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-tests=no --enable-samples=no --enable-icuio=no --enable-extras=no ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
            )
    include_directories(${icu4c_INC})

    # Expose the package libraries under the mindspore:: namespace.
    add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
    add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
    add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})

    # Only defined when ICU4C is available, i.e. never on Windows.
    add_definitions(-D ENABLE_ICU4C)
else()
    message("icu4c thirdparty do not support windows currently.")
endif()
|
@ -1,8 +1,21 @@
|
||||
# Tag every .cc file below this directory with the SM_MD SUBMODULE_ID compile definition.
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)

# These kernels are only added to the library on non-Windows platforms.
if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows"))
    set(ICU_DEPEND_FILES
            basic_tokenizer_op.cc
            bert_tokenizer_op.cc
            case_fold_op.cc
            normalize_utf8_op.cc
            regex_replace_op.cc
            regex_tokenizer_op.cc
            unicode_script_tokenizer_op.cc
            whitespace_tokenizer_op.cc)
endif()

# ${ICU_DEPEND_FILES} expands to nothing when the variable was not set above.
add_library(text-kernels OBJECT
        lookup_op.cc
        jieba_tokenizer_op.cc
        unicode_char_tokenizer_op.cc
        ngram_op.cc
        wordpiece_tokenizer_op.cc
        ${ICU_DEPEND_FILES}
        )
|
||||
|
@ -0,0 +1,93 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/basic_tokenizer_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
const bool BasicTokenizerOp::kDefLowerCase = false;
|
||||
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
|
||||
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
|
||||
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
|
||||
const char BasicTokenizerOp::kCommonPattern[] =
|
||||
"[!-/]"
|
||||
"|[:-@]"
|
||||
"|[\\[-`]"
|
||||
"|[{-~]"
|
||||
"|[\\p{P}]"
|
||||
"|[\\x{4E00}-\\x{9FFF}]"
|
||||
"|[\\x{3400}-\\x{4DBF}]"
|
||||
"|[\\x{20000}-\\x{2A6DF}]"
|
||||
"|[\\x{2A700}-\\x{2B73F}]"
|
||||
"|[\\x{2B740}-\\x{2B81F}]"
|
||||
"|[\\x{2B820}-\\x{2CEAF}]"
|
||||
"|[\\x{F900}-\\x{FAFF}]"
|
||||
"|[\\x{2F800}-\\x{2FA1F}]";
|
||||
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|";
|
||||
|
||||
// Builds the helper ops and the delimiter regexes used by Compute().
//
// lower_case            - when true, Compute() case-folds the text and strips accents.
// keep_whitespace       - when true, whitespace delimiters are also emitted as tokens.
// normalization_form    - form given to the normalizer that Compute() uses when lower_case is false.
// preserve_unused_token - when true, the special tokens in kUnusedPattern are matched as whole delimiters.
BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
                                   bool preserve_unused_token)
    : lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      // Previously left uninitialised; initialised here in declaration order.
      normalization_form_(normalization_form),
      preserve_unused_token_(preserve_unused_token),
      case_fold_(std::make_unique<CaseFoldOp>()),
      nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
      common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
      replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
      replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
  // The text is always split on whitespace runs and on the common delimiter characters.
  std::string delim_pattern = std::string("\\s+|") + kCommonPattern;
  // Delimiters matching this pattern are kept as tokens; whitespace only on request.
  std::string keep_delim_pattern = keep_whitespace_ ? delim_pattern : std::string(kCommonPattern);
  if (preserve_unused_token_) {
    // kUnusedPattern already ends with '|', so plain concatenation yields a valid alternation.
    keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
    delim_pattern = kUnusedPattern + delim_pattern;
  }
  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
}
|
||||
|
||||
// Tokenizes one scalar string tensor.
// Pipeline: (case fold -> NFD -> drop \p{Mn}) when lower_case_ is set, otherwise the configured
// normalization; then control/format characters are replaced by spaces and the text is split
// by the regex tokenizer. Any failing step's Status is returned unchanged.
Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  const bool bad_input = input->Rank() != 0 || input->type() != DataType::DE_STRING;
  if (bad_input) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
  }

  std::shared_ptr<Tensor> normalized;
  if (!lower_case_) {
    RETURN_IF_NOT_OK(common_normalize_->Compute(input, &normalized));
  } else {
    // to lower case
    std::shared_ptr<Tensor> folded;
    RETURN_IF_NOT_OK(case_fold_->Compute(input, &folded));
    // strip accent characters: decompose first, then remove the combining marks
    std::shared_ptr<Tensor> decomposed;
    RETURN_IF_NOT_OK(nfd_normalize_->Compute(folded, &decomposed));
    RETURN_IF_NOT_OK(replace_accent_chars_->Compute(decomposed, &normalized));
  }

  // strip control characters
  std::shared_ptr<Tensor> cleaned;
  RETURN_IF_NOT_OK(replace_control_chars_->Compute(normalized, &cleaned));
  return regex_tokenizer_->Compute(cleaned, output);
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,64 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/text/kernels/case_fold_op.h"
|
||||
#include "dataset/text/kernels/normalize_utf8_op.h"
|
||||
#include "dataset/text/kernels/regex_replace_op.h"
|
||||
#include "dataset/text/kernels/regex_tokenizer_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class BasicTokenizerOp : public TensorOp {
|
||||
public:
|
||||
static const bool kDefLowerCase;
|
||||
static const bool kDefKeepWhitespace;
|
||||
static const NormalizeForm kDefNormalizationForm;
|
||||
static const bool kDefPreserveUnusedToken;
|
||||
BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
|
||||
NormalizeForm normalization_form = kDefNormalizationForm,
|
||||
bool preserve_unused_token = kDefPreserveUnusedToken);
|
||||
|
||||
~BasicTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
private:
|
||||
static const char kCommonPattern[];
|
||||
static const char kUnusedPattern[];
|
||||
bool lower_case_;
|
||||
bool keep_whitespace_;
|
||||
NormalizeForm normalization_form_;
|
||||
bool preserve_unused_token_;
|
||||
std::unique_ptr<CaseFoldOp> case_fold_;
|
||||
std::unique_ptr<NormalizeUTF8Op> nfd_normalize_;
|
||||
std::unique_ptr<NormalizeUTF8Op> common_normalize_;
|
||||
std::unique_ptr<RegexReplaceOp> replace_accent_chars_;
|
||||
std::unique_ptr<RegexReplaceOp> replace_control_chars_;
|
||||
std::unique_ptr<RegexTokenizerOp> regex_tokenizer_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
|
@ -0,0 +1,27 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/bert_tokenizer_op.h"
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Runs the basic tokenizer on `input`, then feeds its tokens to the wordpiece tokenizer.
// Returns the first non-OK Status produced by either stage.
Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  std::shared_ptr<Tensor> basic_tokens;
  RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tokens));
  return wordpiece_tokenizer_.Compute(basic_tokens, output);
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,54 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/text/kernels/basic_tokenizer_op.h"
|
||||
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
class BertTokenizerOp : public TensorOp {
|
||||
public:
|
||||
BertTokenizerOp(const std::shared_ptr<Vocab> &vocab,
|
||||
const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
|
||||
const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
|
||||
const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
|
||||
bool lower_case = BasicTokenizerOp::kDefLowerCase,
|
||||
bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
|
||||
NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
|
||||
bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
|
||||
: wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
|
||||
basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}
|
||||
|
||||
~BertTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
private:
|
||||
WordpieceTokenizerOp wordpiece_tokenizer_;
|
||||
BasicTokenizerOp basic_tokenizer_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
|
@ -0,0 +1,46 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/case_fold_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
// Applies ICU's NFKC_Casefold normalizer to every string element of `input` and stores the
// results in a new tensor that has the same shape as `input`.
Status CaseFoldOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  icu::ErrorCode error;
  const icu::Normalizer2 *normalizer = icu::Normalizer2::getNFKCCasefoldInstance(error);
  CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCCasefoldInstance failed.");

  // One pre-sized output slot per input element; each sink appends into its own slot.
  std::vector<std::string> folded(input->Size());
  int idx = 0;
  for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
    const std::string_view src = *iter;
    icu::StringByteSink<std::string> sink(&folded[idx]);
    normalizer->normalizeUTF8(0, icu::StringPiece(src.data(), src.size()), sink, nullptr, error);
    CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
    idx++;
  }
  *output = std::make_shared<Tensor>(std::move(folded), input->shape());
  return Status::OK();
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,39 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
|
||||
#include <memory>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class CaseFoldOp : public TensorOp {
|
||||
public:
|
||||
CaseFoldOp() {}
|
||||
|
||||
~CaseFoldOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "CaseFoldOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
|
@ -0,0 +1,75 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/normalize_utf8_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;
|
||||
Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
icu::ErrorCode error;
|
||||
const icu::Normalizer2 *normalize = nullptr;
|
||||
switch (normalize_form_) {
|
||||
case NormalizeForm::kNone: {
|
||||
*output = input;
|
||||
return Status::OK();
|
||||
}
|
||||
case NormalizeForm::kNfc: {
|
||||
normalize = icu::Normalizer2::getNFCInstance(error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFCInstance failed");
|
||||
break;
|
||||
}
|
||||
case NormalizeForm::kNfkc: {
|
||||
normalize = icu::Normalizer2::getNFKCInstance(error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCInstance failed");
|
||||
break;
|
||||
}
|
||||
case NormalizeForm::kNfd: {
|
||||
normalize = icu::Normalizer2::getNFDInstance(error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFDInstance failed");
|
||||
break;
|
||||
}
|
||||
case NormalizeForm::kNfkd: {
|
||||
normalize = icu::Normalizer2::getNFKDInstance(error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKDInstance failed");
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
RETURN_STATUS_UNEXPECTED("unexpected normalize form");
|
||||
break;
|
||||
}
|
||||
}
|
||||
std::vector<std::string> strs(input->Size());
|
||||
int i = 0;
|
||||
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
|
||||
icu::StringByteSink<std::string> sink(&strs[i++]);
|
||||
normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
|
||||
}
|
||||
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,50 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
|
||||
#include <memory>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Unicode normalization forms understood by NormalizeUTF8Op.
enum class NormalizeForm {
  kNone = 0,  // leave the text untouched
  kNfc = 1,
  kNfkc = 2,
  kNfd = 3,
  kNfkd = 4,
};
|
||||
|
||||
class NormalizeUTF8Op : public TensorOp {
|
||||
public:
|
||||
static const NormalizeForm kDefNormalizeForm;
|
||||
explicit NormalizeUTF8Op(NormalizeForm normalize_form = kDefNormalizeForm) : normalize_form_(normalize_form) {}
|
||||
|
||||
~NormalizeUTF8Op() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "NormalizeUTF8Op"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
private:
|
||||
NormalizeForm normalize_form_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
|
@ -0,0 +1,57 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/regex_replace_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
// Replaces matches of `matcher` in `text` with replace_ (all matches, or only the first when
// replace_all_ is false). The UTF-8 result is appended to `*out`.
Status RegexReplaceOp::RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text,
                                    std::string *out) const {
  const bool args_ok = matcher != nullptr && out != nullptr;
  CHECK_FAIL_RETURN_UNEXPECTED(args_ok, "Input is null");

  // ICU regexes work on UTF-16, so convert the input before resetting the matcher on it.
  UErrorCode icu_error = U_ZERO_ERROR;
  icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text);
  matcher->reset(unicode_text);

  icu::UnicodeString unicode_out =
    replace_all_ ? matcher->replaceAll(replace_, icu_error) : matcher->replaceFirst(replace_, icu_error);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "RegexReplace failed");

  unicode_out.toUTF8String(*out);
  return Status::OK();
}
|
||||
|
||||
// Applies the regex replacement to every string element of `input` and stores the results in
// a new tensor that has the same shape as `input`.
// Fails if the ICU matcher cannot be built from pattern_ or if any replacement fails.
Status RegexReplaceOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  UErrorCode icu_error = U_ZERO_ERROR;
  // One matcher is compiled per call and reset on each element by RegexReplace().
  icu::RegexMatcher matcher(pattern_, 0, icu_error);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "Create icu RegexMatcher failed, you may input one error pattern");
  std::vector<std::string> strs(input->Size());
  int i = 0;
  for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
    // Advance the slot index per element. RegexReplace() appends to its output, so without the
    // increment every result would be concatenated into strs[0].
    RETURN_IF_NOT_OK(RegexReplace(&matcher, *iter, &strs[i++]));
  }
  *output = std::make_shared<Tensor>(std::move(strs), input->shape());
  return Status::OK();
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,55 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class RegexReplaceOp : public TensorOp {
|
||||
public:
|
||||
RegexReplaceOp(const std::string &pattern, const std::string &replace, bool replace_all = true)
|
||||
: pattern_(icu::UnicodeString::fromUTF8(pattern)),
|
||||
replace_(icu::UnicodeString::fromUTF8(replace)),
|
||||
replace_all_(replace_all) {}
|
||||
|
||||
~RegexReplaceOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "RegexReplaceOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
protected:
|
||||
Status RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, std::string *out) const;
|
||||
|
||||
private:
|
||||
const icu::UnicodeString pattern_;
|
||||
const icu::UnicodeString replace_;
|
||||
const bool replace_all_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
|
@ -0,0 +1,103 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/regex_tokenizer_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Extracts `len` UTF-16 code units of `input` starting at `start`.
// The piece is appended to `*out_utf8` as UTF-8 and/or assigned to `*out_unicode`; at least one
// of the two outputs must be non-null. An empty or out-of-bounds range is rejected.
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
                                          icu::UnicodeString *out_unicode) const {
  const bool has_output = out_utf8 != nullptr || out_unicode != nullptr;
  CHECK_FAIL_RETURN_UNEXPECTED(has_output, "Wrong input");

  const int total_len = input.length();
  const int end = start + len;
  const bool in_range = start >= 0 && len > 0 && end <= total_len;
  CHECK_FAIL_RETURN_UNEXPECTED(in_range, "Out of range");

  icu::UnicodeString piece;
  input.extract(start, len, piece);
  if (out_utf8 != nullptr) {
    piece.toUTF8String(*out_utf8);
  }
  if (out_unicode != nullptr) {
    *out_unicode = piece;
  }
  return Status::OK();
}
|
||||
|
||||
// Splits `text` at every match of delim_pattern_ and writes the pieces to `*out_tokens`
// (cleared first). Non-empty text between delimiters is always emitted. A delimiter is emitted
// too when keep_delim_ is set and the whole delimiter matches keep_delim_pattern_.
// Returns an error if `out_tokens` is null, a pattern fails to compile, or ICU reports a
// failure while matching.
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
  CHECK_FAIL_RETURN_UNEXPECTED(out_tokens != nullptr, "Output token vector is null");
  UErrorCode status = U_ZERO_ERROR;
  out_tokens->clear();
  icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");
  icu::RegexMatcher delim_matcher(keep_delim_pattern_, 0, status);
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");

  icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
  token_matcher.reset(utext);

  // Index (in UTF-16 code units) where the next token would start.
  int token_start_index = 0;
  status = U_ZERO_ERROR;
  while (token_matcher.find(status) && U_SUCCESS(status)) {
    int deli_start_index = token_matcher.start(status);
    CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed");
    int deli_end_index = token_matcher.end(status);
    CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched end index failed");

    // Add non-empty token
    int token_len = deli_start_index - token_start_index;
    if (token_len > 0) {
      std::string token;
      RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
      out_tokens->emplace_back(std::move(token));
    }

    int delim_len = deli_end_index - deli_start_index;
    if (keep_delim_ && delim_len > 0) {
      icu::UnicodeString delim_str;
      std::string delim_utf8_str;
      RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
      delim_matcher.reset(delim_str);
      // Report a matching failure instead of silently dropping the delimiter.
      const bool keep_this_delim = delim_matcher.matches(status);
      CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Match delimiter with RegexMatcher failed");
      if (keep_this_delim) {
        out_tokens->emplace_back(std::move(delim_utf8_str));
      }
    }
    token_start_index = deli_end_index;
  }
  // The loop also ends when find() fails; do not treat that as a normal end of input.
  CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "RegexMatcher find failed");

  // Whatever follows the last delimiter is the final token.
  if (token_start_index < utext.length()) {
    std::string temp;
    RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
    out_tokens->emplace_back(std::move(temp));
  }
  return Status::OK();
}
|
||||
|
||||
// Tokenizes one scalar string tensor into a 1-D string tensor with one element per token.
Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  const bool bad_input = input->Rank() != 0 || input->type() != DataType::DE_STRING;
  if (bad_input) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
  }

  std::string_view text;
  RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));

  std::vector<std::string> tokens;
  RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));

  // Read the count before the vector is handed over to the new tensor.
  const auto token_count = static_cast<dsize_t>(tokens.size());
  *output = std::make_shared<Tensor>(std::move(tokens), TensorShape({token_count}));
  return Status::OK();
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,58 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_REGEX_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_REGEX_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class RegexTokenizerOp : public TensorOp {
|
||||
public:
|
||||
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
|
||||
: delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
|
||||
keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
|
||||
keep_delim_(!keep_delim_pattern.empty()) {}
|
||||
|
||||
~RegexTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
protected:
|
||||
Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
|
||||
icu::UnicodeString *out_unicode = nullptr) const;
|
||||
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
|
||||
|
||||
private:
|
||||
const icu::UnicodeString delim_pattern_;
|
||||
const icu::UnicodeString keep_delim_pattern_;
|
||||
const bool keep_delim_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_REGEX_TOKENIZER_OP_H_
|
@ -0,0 +1,93 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "cppjieba/Unicode.hpp"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
using cppjieba::DecodeRunesInString;
|
||||
using cppjieba::RuneStrArray;
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
// Out-of-class definition of the static constant kDefKeepWhitespace (value: false).
// NOTE(review): presumably the default for the keep_whitespace option declared in the
// header -- confirm there.
const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
|
||||
|
||||
// Splits a scalar UTF-8 string tensor into runs of characters that share the same
// Unicode script and the same whitespace-ness.
//
// @param input  Tensor to tokenize; must have rank 0 and type DE_STRING.
// @param output Receives a newly allocated 1-D string tensor of the segments.
// @return Status::OK() on success; an "unexpected" error status when the input is
//         not a scalar string tensor or cannot be decoded as UTF-8.
Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
  }

  std::string_view str;
  RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));

  RuneStrArray runes;
  const bool decoded = DecodeRunesInString(str.data(), str.size(), runes);
  if (!decoded) {
    RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
  }

  std::vector<std::string> splits;
  icu::ErrorCode icu_status;
  UScriptCode prev_script = USCRIPT_INVALID_CODE;
  bool prev_is_space = false;
  // Byte offset and byte length (within `str`) of the segment being accumulated.
  int seg_begin = 0;
  int seg_bytes = 0;

  // Emits the pending segment. When keep_whitespace_ is false, segments made of
  // whitespace are dropped instead of emitted.
  auto emit_pending = [&]() {
    if (keep_whitespace_ || !prev_is_space) {
      splits.emplace_back(str.substr(seg_begin, seg_bytes));
    }
  };

  for (size_t idx = 0; idx < runes.size(); ++idx) {
    const auto &cur = runes[idx];
    bool cur_is_space = u_isUWhiteSpace(cur.rune);
    UScriptCode cur_script = uscript_getScript(cur.rune, icu_status);
    if (icu_status.isFailure()) {
      // A failed script lookup is not fatal: clear the error and treat the
      // character as having an invalid script.
      icu_status.reset();
      cur_script = USCRIPT_INVALID_CODE;
    }

    // A segment ends where either property changes:
    //   - the script code, e.g. "Chinese中国" is split into ["Chinese", "中国"];
    //   - whitespace vs. non-whitespace, e.g. " ." is split into [" ", "."].
    const bool at_boundary = seg_bytes > 0 && (cur_script != prev_script || cur_is_space != prev_is_space);
    if (!at_boundary) {
      seg_bytes += cur.len;
    } else {
      emit_pending();
      seg_begin = cur.offset;
      seg_bytes = cur.len;
    }

    prev_script = cur_script;
    prev_is_space = cur_is_space;
  }

  // Emit whatever is still pending after the last character.
  if (seg_bytes > 0) {
    emit_pending();
  }

  // When nothing was emitted (for example an empty input string) the result is a
  // 1-D tensor holding a single empty string.
  if (splits.empty()) {
    splits.emplace_back("");
  }

  *output = std::make_shared<Tensor>(splits, TensorShape({static_cast<dsize_t>(splits.size())}));
  return Status::OK();
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue