!2941 MD tokenizer support output offsets

Merge pull request !2941 from xiefangqi/md_add_offsets_feature
pull/2941/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 6284c42a76

@ -601,13 +601,14 @@ void bindTensorOps4(py::module *m) {
void bindTokenizerOps(py::module *m) {
(void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
.def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
py::arg("mode") = JiebaMode::kMix)
.def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>(), py::arg("hmm_path"),
py::arg("mp_path"), py::arg("mode") = JiebaMode::kMix,
py::arg("with_offsets") = JiebaTokenizerOp::kDefWithOffsets)
.def("add_word",
[](JiebaTokenizerOp &self, const std::string word, int freq) { THROW_IF_ERROR(self.AddWord(word, freq)); });
(void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>(
*m, "UnicodeCharTokenizerOp", "Tokenize a scalar tensor of UTF-8 string to Unicode characters.")
.def(py::init<>());
.def(py::init<const bool &>(), py::arg("with_offsets") = UnicodeCharTokenizerOp::kDefWithOffsets);
(void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp",
"Tensor operation to LookUp each word.")
.def(py::init([](std::shared_ptr<Vocab> vocab, const py::object &py_word) {
@ -632,21 +633,25 @@ void bindTokenizerOps(py::module *m) {
py::arg("separator"));
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
*m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
.def(
py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &>(),
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
}
void bindDependIcuTokenizerOps(py::module *m) {
#ifdef ENABLE_ICU4C
(void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
*m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
.def(py::init<>());
.def(py::init<const bool &>(), py::arg("with_offsets") = WhitespaceTokenizerOp::kDefWithOffsets);
(void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
*m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
.def(py::init<>())
.def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
.def(py::init<const bool &, const bool &>(),
py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace,
py::arg("with_offsets") = UnicodeScriptTokenizerOp::kDefWithOffsets);
(void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
*m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
.def(py::init<>());
@ -660,24 +665,28 @@ void bindDependIcuTokenizerOps(py::module *m) {
py::arg("replace_all"));
(void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
*m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
.def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"), py::arg("keep_delim_pattern"));
.def(py::init<const std::string &, const std::string &, const bool &>(), py::arg("delim_pattern"),
py::arg("keep_delim_pattern"), py::arg("with_offsets") = RegexTokenizerOp::kDefWithOffsets);
(void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
*m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
.def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
.def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>(),
py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
py::arg("with_offsets") = BasicTokenizerOp::kDefWithOffsets);
(void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp",
"Tokenizer used for Bert text process.")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
NormalizeForm, bool>(),
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &,
const bool &, const NormalizeForm &, const bool &, const bool &>(),
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
#endif
}
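
For orientation, a minimal usage sketch of the new flag from the Python side. This is an assumption-laden illustration, not code from this change: the suppressed `text/transforms.py` diff is what actually wires these bindings into `mindspore.dataset.text`, the file path and output column names are made up, and the `output_columns`/`columns_order` arguments follow the dataset `map` API of this era.

```python
import mindspore.dataset as ds
import mindspore.dataset.text as text

# With with_offsets=True a tokenizer emits three tensors per row:
# tokens, offsets_start and offsets_limit (uint32 byte positions).
data = ds.TextFileDataset("corpus.txt", shuffle=False)              # hypothetical file
tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
data = data.map(input_columns=["text"],
                output_columns=["token", "offsets_start", "offsets_limit"],
                columns_order=["token", "offsets_start", "offsets_limit"],
                operations=tokenizer)
for row in data.create_dict_iterator():
    print(text.to_str(row["token"]), row["offsets_start"], row["offsets_limit"])
```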

@ -27,10 +27,12 @@
namespace mindspore {
namespace dataset {
const bool BasicTokenizerOp::kDefLowerCase = false;
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
const bool BasicTokenizerOp::kDefWithOffsets = false;
const char BasicTokenizerOp::kCommonPattern[] =
"[!-/]"
"|[:-@]"
@ -47,11 +49,14 @@ const char BasicTokenizerOp::kCommonPattern[] =
"|[\\x{2F800}-\\x{2FA1F}]";
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|\\[unused\\d+\\]|";
const std::unordered_set<std::string> BasicTokenizerOp::kUnusedWords{"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"};
BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
bool preserve_unused_token)
BasicTokenizerOp::BasicTokenizerOp(const bool &lower_case, const bool &keep_whitespace,
const NormalizeForm &normalization_form, const bool &preserve_unused_token,
const bool &with_offsets)
: lower_case_(lower_case),
keep_whitespace_(keep_whitespace),
preserve_unused_token_(preserve_unused_token),
with_offsets_(with_offsets),
case_fold_(std::make_unique<CaseFoldOp>()),
nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
normalization_form_(normalization_form),
@ -69,7 +74,7 @@ BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, Normal
keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
delim_pattern = kUnusedPattern + delim_pattern;
}
regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern, with_offsets_);
}
Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text,
@ -135,9 +140,10 @@ Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor
return Status::OK();
}
Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
Status BasicTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::shared_ptr<Tensor> cur_input;
@ -145,10 +151,10 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
if (lower_case_) {
if (!preserve_unused_token_) {
// to lower case
RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
RETURN_IF_NOT_OK(case_fold_->Compute(input[0], &processed_tensor));
} else {
// to lower case except words in kUnusedWords
RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input, &processed_tensor));
RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input[0], &processed_tensor));
}
cur_input = processed_tensor;
// strip accent characters
@ -156,12 +162,12 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
cur_input = processed_tensor;
RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
} else {
RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
RETURN_IF_NOT_OK(common_normalize_->Compute(input[0], &processed_tensor));
}
// strip control characters
cur_input = processed_tensor;
RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
return regex_tokenizer_->Compute(processed_tensor, output);
return regex_tokenizer_->Compute(TensorRow(0, {std::move(processed_tensor)}), output);
}
} // namespace dataset
} // namespace mindspore

@ -36,15 +36,18 @@ class BasicTokenizerOp : public TensorOp {
static const bool kDefKeepWhitespace;
static const NormalizeForm kDefNormalizationForm;
static const bool kDefPreserveUnusedToken;
explicit BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
NormalizeForm normalization_form = kDefNormalizationForm,
bool preserve_unused_token = kDefPreserveUnusedToken);
static const bool kDefWithOffsets;
explicit BasicTokenizerOp(const bool &lower_case = kDefLowerCase, const bool &keep_whitespace = kDefKeepWhitespace,
const NormalizeForm &normalization_form = kDefNormalizationForm,
const bool &preserve_unused_token = kDefPreserveUnusedToken,
const bool &with_offsets = kDefWithOffsets);
~BasicTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
protected:
Status CaseFoldWithoutUnusedWords(const std::string_view &text, const std::unordered_set<std::string> &unused_words,
@ -55,6 +58,7 @@ class BasicTokenizerOp : public TensorOp {
static const char kCommonPattern[];
static const char kUnusedPattern[];
static const std::unordered_set<std::string> kUnusedWords;
bool with_offsets_;
bool lower_case_;
bool keep_whitespace_;
NormalizeForm normalization_form_;

@ -16,9 +16,9 @@
#include "dataset/text/kernels/bert_tokenizer_op.h"
namespace mindspore {
namespace dataset {
Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
std::shared_ptr<Tensor> basic_tensor;
Status BertTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
TensorRow basic_tensor;
RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor));
RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output));
return Status::OK();

@ -32,18 +32,19 @@ class BertTokenizerOp : public TensorOp {
const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
bool lower_case = BasicTokenizerOp::kDefLowerCase,
bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
: wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}
const bool &lower_case = BasicTokenizerOp::kDefLowerCase,
const bool &keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
const NormalizeForm &normalization_form = BasicTokenizerOp::kDefNormalizationForm,
const bool &preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken,
const bool &with_offsets = WordpieceTokenizerOp::kDefWithOffsets)
: wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets),
basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token, with_offsets) {}
~BertTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
private:
WordpieceTokenizerOp wordpiece_tokenizer_;

@ -23,35 +23,63 @@
namespace mindspore {
namespace dataset {
JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, JiebaMode mode)
: jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path) {
const bool JiebaTokenizerOp::kDefWithOffsets = false;
JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, const JiebaMode &mode,
const bool &with_offsets)
: jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path), with_offsets_(with_offsets) {
jieba_parser_ = std::make_unique<cppjieba::Jieba>(mp_dict_path_, hmm_model_path_, "");
}
Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
Status JiebaTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor");
}
std::string_view sentence_v;
RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {}));
RETURN_IF_NOT_OK(input[0]->GetItemAt(&sentence_v, {}));
std::string sentence{sentence_v};
std::vector<std::string> words;
std::vector<uint32_t> offsets_start, offsets_limit;
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
if (sentence == "") {
words.push_back("");
} else {
std::vector<cppjieba::Word> tmp;
if (jieba_mode_ == JiebaMode::kMp) {
jieba_parser_->CutSmall(sentence, words, MAX_WORD_LENGTH);
std::unique_ptr<cppjieba::MPSegment> mp_seg = std::make_unique<cppjieba::MPSegment>(jieba_parser_->GetDictTrie());
mp_seg->Cut(sentence, tmp, MAX_WORD_LENGTH);
} else if (jieba_mode_ == JiebaMode::kHmm) {
jieba_parser_->CutHMM(sentence, words);
std::unique_ptr<cppjieba::HMMSegment> hmm_seg =
std::make_unique<cppjieba::HMMSegment>(jieba_parser_->GetHMMModel());
hmm_seg->Cut(sentence, tmp);
} else { // Mix
jieba_parser_->Cut(sentence, words, true);
std::unique_ptr<cppjieba::MixSegment> mix_seg =
std::make_unique<cppjieba::MixSegment>(jieba_parser_->GetDictTrie(), jieba_parser_->GetHMMModel());
mix_seg->Cut(sentence, tmp, true);
}
GetStringsFromWords(tmp, words);
for (auto item : tmp) {
offsets_start.push_back(static_cast<uint32_t>(item.offset));
offsets_limit.push_back(static_cast<uint32_t>(item.offset + item.word.length()));
}
}
*output = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
token_tensor = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
return Status::OK();
}
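
As a side note on what the two offset vectors mean: they are byte positions into the UTF-8 input, assuming cppjieba's `Word.offset` is a byte offset, which the `item.offset + item.word.length()` arithmetic above suggests. A small self-contained illustration, using the sentence and tokens from the C++ test further down:

```python
sentence = "今天天气太好了我们一起去外面玩吧"
tokens = ["今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"]

pos, pairs = 0, []
for tok in tokens:
    nbytes = len(tok.encode("utf-8"))        # each CJK character is 3 bytes in UTF-8
    pairs.append((tok, pos, pos + nbytes))   # (token, offsets_start, offsets_limit)
    pos += nbytes

assert pos == len(sentence.encode("utf-8"))
print(pairs)  # ('今天天气', 0, 12), ('太好了', 12, 21), ('我们', 21, 27), ...
```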

@ -30,15 +30,19 @@ enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 };
class JiebaTokenizerOp : public TensorOp {
public:
// deffault constant for Jieba MPSegment algorithm.
// default constant for Jieba MPSegment algorithm.
static constexpr size_t MAX_WORD_LENGTH = 512;
// default constant for whether Jieba outputs the offsets tensors.
static const bool kDefWithOffsets;
// Constructor for JiebaTokenizerOp.
// @param hmm_path HMM model file.
// @param mp_path MP model file.
// @mode tokenization mode [Default "MIX"], "MP" mode will tokenize with MPSegment algorithm, "HMM" mode will
// tokenize with Hidden Markov Model Segment algorithm, "MIX" mode will tokenize with a mix of MPSegment and
// HMMSegment algorithm.
JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, JiebaMode mode = JiebaMode::kMix);
// @with_offsets whether to output the offsets tensors along with the tokens.
JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix,
const bool &with_offsets = kDefWithOffsets);
~JiebaTokenizerOp() override = default;
void Print(std::ostream &out) const override {
@ -46,7 +50,7 @@ class JiebaTokenizerOp : public TensorOp {
<< mp_dict_path_;
}
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
// @word the word to be added to the JiebaTokenizer.
// @freq [Default 0] the frequency of the word to be added.
@ -58,6 +62,7 @@ class JiebaTokenizerOp : public TensorOp {
std::string mp_dict_path_;
std::unique_ptr<cppjieba::Jieba> jieba_parser_;
JiebaMode jieba_mode_;
bool with_offsets_;
};
} // namespace dataset
} // namespace mindspore

@ -22,8 +22,11 @@
namespace mindspore {
namespace dataset {
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
icu::UnicodeString *out_unicode) const {
const bool RegexTokenizerOp::kDefWithOffsets = false;
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len,
std::string *out_utf8, icu::UnicodeString *out_unicode) const {
CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
int total_len = input.length();
int end = start + len;
@ -39,7 +42,9 @@ Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int s
return Status::OK();
}
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start,
std::vector<uint32_t> *offsets_limit) const {
UErrorCode status = U_ZERO_ERROR;
out_tokens->clear();
icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
@ -50,6 +55,7 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
token_matcher.reset(utext);
int text_start_index = 0;
int token_start_index = 0;
status = U_ZERO_ERROR;
while (token_matcher.find(status) && U_SUCCESS(status)) {
@ -62,41 +68,70 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
int token_len = deli_start_index - token_start_index;
if (token_len > 0) {
std::string token;
uint32_t token_offset = 0;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
token_offset = token.length();
out_tokens->emplace_back(std::move(token));
offsets_start->push_back(static_cast<uint32_t>(text_start_index));
offsets_limit->push_back(static_cast<uint32_t>(text_start_index + token_offset));
text_start_index += token_offset;
}
int delim_len = deli_end_index - deli_start_index;
if (keep_delim_ && delim_len > 0) {
if (delim_len > 0) {
icu::UnicodeString delim_str;
std::string delim_utf8_str;
uint32_t delim_str_offset = 0;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
delim_matcher.reset(delim_str);
if (delim_matcher.matches(status) && U_SUCCESS(status)) {
delim_str_offset = delim_utf8_str.length();
if (keep_delim_ && delim_matcher.matches(status) && U_SUCCESS(status)) {
out_tokens->emplace_back(std::move(delim_utf8_str));
offsets_start->push_back(static_cast<uint32_t>(text_start_index));
offsets_limit->push_back(static_cast<uint32_t>(text_start_index + delim_str_offset));
}
text_start_index += delim_str_offset;
}
token_start_index = deli_end_index;
}
if (token_start_index < utext.length()) {
std::string temp;
uint32_t temp_offset = 0;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
temp_offset = temp.length();
out_tokens->emplace_back(std::move(temp));
offsets_start->push_back(static_cast<uint32_t>(text_start_index));
offsets_limit->push_back(static_cast<uint32_t>(text_start_index + temp_offset));
}
return Status::OK();
}
Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
Status RegexTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view text;
RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
std::vector<std::string> tokens;
RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
*output = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
std::vector<uint32_t> offsets_start;
std::vector<uint32_t> offsets_limit;
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
RETURN_IF_NOT_OK(input[0]->GetItemAt(&text, {}));
RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens, &offsets_start, &offsets_limit));
token_tensor = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
return Status::OK();
}
} // namespace dataset
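
A rough Python analogue of the offset bookkeeping in `GetRegexTokens`, for readers who do not want to trace the ICU calls. This is illustrative only: the real op uses `icu::RegexMatcher` and byte offsets, while the sketch below uses `re` and character positions (they coincide for ASCII input). The key behavior it mirrors is that a delimiter is emitted as a token only when it matches `keep_delim_pattern`, but its width always advances the running offset.

```python
import re

def regex_tokenize(text, delim_pattern, keep_delim_pattern=""):
    """Split on delim_pattern, tracking (start, limit) offsets for every token."""
    tokens, starts, limits = [], [], []
    keep = re.compile(keep_delim_pattern) if keep_delim_pattern else None
    pos = 0
    for m in re.finditer(delim_pattern, text):
        if m.start() > pos:                       # token before the delimiter
            tokens.append(text[pos:m.start()])
            starts.append(pos)
            limits.append(m.start())
        if keep and keep.fullmatch(m.group()):    # optionally keep the delimiter itself
            tokens.append(m.group())
            starts.append(m.start())
            limits.append(m.end())
        pos = m.end()
    if pos < len(text):                           # trailing token after the last delimiter
        tokens.append(text[pos:])
        starts.append(pos)
        limits.append(len(text))
    return tokens, starts, limits

print(regex_tokenize("Welcome to Beijing", r"\s+"))
# (['Welcome', 'to', 'Beijing'], [0, 8, 11], [7, 10, 18])
```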

@ -32,25 +32,31 @@ namespace dataset {
class RegexTokenizerOp : public TensorOp {
public:
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
static const bool kDefWithOffsets;
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern,
const bool &with_offsets = kDefWithOffsets)
: delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
with_offsets_(with_offsets),
keep_delim_(!keep_delim_pattern.empty()) {}
~RegexTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
protected:
Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
Status GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len, std::string *out_utf8,
icu::UnicodeString *out_unicode = nullptr) const;
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
private:
const icu::UnicodeString delim_pattern_;
const icu::UnicodeString keep_delim_pattern_;
bool with_offsets_;
const bool keep_delim_;
};
} // namespace dataset

@ -27,26 +27,46 @@ using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
const bool UnicodeCharTokenizerOp::kDefWithOffsets = false;
Status UnicodeCharTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
std::vector<std::string> splits(runes.size());
std::vector<uint32_t> offsets_start, offsets_limit;
for (size_t i = 0; i < runes.size(); i++) {
offsets_start.push_back(runes[i].offset);
offsets_limit.push_back(runes[i].offset + runes[i].len);
splits[i] = str.substr(runes[i].offset, runes[i].len);
}
if (splits.empty()) {
splits.emplace_back("");
offsets_start.push_back(0);
offsets_limit.push_back(0);
}
token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset
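
In other words, each decoded code point becomes one token whose (start, limit) span is its byte range in the UTF-8 input, mirroring `runes[i].offset` and `runes[i].len`. A tiny illustration (not from this change):

```python
s = "a中b"
pos, triples = 0, []
for ch in s:
    n = len(ch.encode("utf-8"))          # byte width of this code point
    triples.append((ch, pos, pos + n))   # (token, offsets_start, offsets_limit)
    pos += n
print(triples)  # [('a', 0, 1), ('中', 1, 4), ('b', 4, 5)]
```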

@ -26,13 +26,18 @@ namespace dataset {
class UnicodeCharTokenizerOp : public TensorOp {
public:
UnicodeCharTokenizerOp() {}
static const bool kDefWithOffsets;
explicit UnicodeCharTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}
~UnicodeCharTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "UnicodeCharTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
private:
bool with_offsets_;
};
} // namespace dataset

@ -32,24 +32,28 @@ namespace mindspore {
namespace dataset {
const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
const bool UnicodeScriptTokenizerOp::kDefWithOffsets = false;
Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
Status UnicodeScriptTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
UScriptCode last_script = USCRIPT_INVALID_CODE;
icu::ErrorCode status;
int start = 0;
int len = 0;
std::vector<std::string> splits;
std::vector<uint32_t> offsets_start, offsets_limit;
bool was_space = false;
for (size_t i = 0; i < runes.size(); i++) {
@ -66,6 +70,8 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
if (len > 0 && (script != last_script || is_space != was_space)) {
// 3) If keep_whitespace_ is false, all the whitespace characters will be discarded
if (keep_whitespace_ || !was_space) {
offsets_start.push_back(static_cast<uint32_t>(start));
offsets_limit.push_back(static_cast<uint32_t>(start + len));
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
@ -79,14 +85,29 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
}
if (len > 0 && (keep_whitespace_ || !was_space)) {
offsets_start.push_back(static_cast<uint32_t>(start));
offsets_limit.push_back(static_cast<uint32_t>(start + len));
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
// 4) If the input is empty scalar string, the output will be 1-D empty string.
if (splits.empty()) {
splits.emplace_back("");
offsets_start.push_back(0);
offsets_limit.push_back(0);
}
token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset

@ -27,17 +27,21 @@ namespace dataset {
class UnicodeScriptTokenizerOp : public TensorOp {
public:
static const bool kDefKeepWhitespace;
static const bool kDefWithOffsets;
explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {}
explicit UnicodeScriptTokenizerOp(const bool &keep_whitespace = kDefKeepWhitespace,
const bool &with_offsets = kDefWithOffsets)
: keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
~UnicodeScriptTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
private:
bool keep_whitespace_;  // Whether to keep whitespace tokens
bool with_offsets_;
};
} // namespace dataset
} // namespace mindspore

@ -30,24 +30,33 @@ using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
const bool WhitespaceTokenizerOp::kDefWithOffsets = false;
Status WhitespaceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
std::vector<uint32_t> offsets_start, offsets_limit;
std::vector<std::string> splits;
int start = 0;
int len = 0;
for (size_t i = 0; i < runes.size(); i++) {
if (u_isUWhiteSpace(runes[i].rune)) {
if (len > 0) {
offsets_start.push_back(static_cast<uint32_t>(start));
offsets_limit.push_back(static_cast<uint32_t>(start + len));
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
len = 0;
@ -60,13 +69,28 @@ Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std:
}
}
if (len > 0) {
offsets_start.push_back(static_cast<uint32_t>(start));
offsets_limit.push_back(static_cast<uint32_t>(start + len));
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
if (splits.empty()) {
splits.emplace_back("");
offsets_start.push_back(0);
offsets_limit.push_back(0);
}
token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset
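
A minimal sketch of the whitespace splitting and the empty-input convention shared by the tokenizers above: an empty scalar string still yields one empty token with a single (0, 0) offset pair. This is an ASCII-only stand-in for the ICU `u_isUWhiteSpace` scan; the real op reports byte offsets.

```python
def tokenize_whitespace(text):
    tokens, starts, limits = [], [], []
    pos = 0
    for piece in text.split():             # stand-in for the ICU whitespace scan
        start = text.index(piece, pos)
        tokens.append(piece)
        starts.append(start)
        limits.append(start + len(piece))
        pos = start + len(piece)
    if not tokens:                          # empty input still yields one empty token
        tokens, starts, limits = [""], [0], [0]
    return tokens, starts, limits

print(tokenize_whitespace("a  bc"))   # (['a', 'bc'], [0, 3], [1, 5])
print(tokenize_whitespace(""))        # ([''], [0], [0])
```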

@ -26,13 +26,18 @@ namespace dataset {
class WhitespaceTokenizerOp : public TensorOp {
public:
WhitespaceTokenizerOp() {}
static const bool kDefWithOffsets;
explicit WhitespaceTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}
~WhitespaceTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
private:
bool with_offsets_;
};
} // namespace dataset
} // namespace mindspore

@ -24,13 +24,16 @@ namespace dataset {
const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##";
const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100;
const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]";
const bool WordpieceTokenizerOp::kDefWithOffsets = false;
WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
const int &max_bytes_per_token, const std::string &unknown_token)
const int &max_bytes_per_token, const std::string &unknown_token,
const bool &with_offsets)
: vocab_(vocab),
suffix_indicator_(suffix_indicator),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(unknown_token) {}
unknown_token_(unknown_token),
with_offsets_(with_offsets) {}
Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
bool *out_found, int *out_end) const {
@ -51,17 +54,22 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
return Status::OK();
}
Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const {
Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, const uint32_t &basic_start,
std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
std::vector<uint32_t> *offsets_limit) const {
out_tokens->clear();
offsets_start->push_back(basic_start);
if (unknown_token_.empty()) {
out_tokens->emplace_back(input_token);
offsets_limit->push_back(basic_start + input_token.length());
} else {
out_tokens->emplace_back(unknown_token_);
offsets_limit->push_back(basic_start + input_token.length());
}
return Status::OK();
}
Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end,
Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int &start, const int &end,
std::vector<std::string> *out_tokens) const {
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range");
std::string subword = input_token.substr(start, end - start);
@ -72,9 +80,19 @@ Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const in
return Status::OK();
}
Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const {
Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, const uint32_t &basic_start,
std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
std::vector<uint32_t> *offsets_limit) const {
if (input_token.size() > max_bytes_per_token_) {
return FoundNoToken(input_token, out_tokens);
offsets_start->push_back(basic_start);
if (!unknown_token_.empty()) {
offsets_limit->push_back(basic_start + unknown_token_.size());
out_tokens->emplace_back(unknown_token_);
} else {
out_tokens->emplace_back(input_token);
offsets_limit->push_back(basic_start + input_token.size());
}
return Status::OK();
}
RuneStrArray runes;
if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) {
@ -86,29 +104,52 @@ Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vect
RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end));
if (found) {
RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens));
offsets_start->push_back(static_cast<uint32_t>(basic_start + start));
offsets_limit->push_back(static_cast<uint32_t>(basic_start + end));
start = end;
} else {
return FoundNoToken(input_token, out_tokens);
return FoundNoToken(input_token, basic_start, out_tokens, offsets_start, offsets_limit);
}
}
return Status::OK();
}
Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
Status WordpieceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
if (input[0]->Rank() > 1 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
}
dsize_t count = 0;
std::vector<std::string> out_tokens;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
std::vector<uint32_t> offsets_start, offsets_limit;
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
for (auto iter = input[0]->begin<std::string_view>(); iter != input[0]->end<std::string_view>(); iter++) {
uint32_t basic_start = 0;
std::vector<std::string> temp_tokens;
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
if (with_offsets_ && input.size() == 3) {
RETURN_IF_NOT_OK(input[1]->GetItemAt<uint32_t>(&basic_start, {count, 0}));
}
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), basic_start, &temp_tokens, &offsets_start, &offsets_limit));
out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
count++;
}
if (out_tokens.empty()) {
out_tokens.emplace_back("");
offsets_start.push_back(0);
offsets_limit.push_back(0);
}
token_tensor = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
*output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
return Status::OK();
}
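
The `basic_start` plumbing above is what keeps subword offsets anchored to the original sentence: when this op runs after `BasicTokenizerOp` with offsets enabled, `input[1]` supplies each word's start offset, so `(basic_start + start, basic_start + end)` points into the original text rather than into the isolated word. A toy illustration with a hypothetical word and subword split:

```python
word, basic_start = "tokenizer", 10               # word begins at byte 10 of the sentence
subwords = [("token", 0, 5), ("##izer", 5, 9)]    # (subword, start, end) within the word

offsets = [(sub, basic_start + s, basic_start + e) for sub, s, e in subwords]
print(offsets)  # [('token', 10, 15), ('##izer', 15, 19)]
```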

@ -37,27 +37,31 @@ class WordpieceTokenizerOp : public TensorOp {
static const char kDefSuffixIndicator[];
static const int kDefMaxBytesPerToken;
static const char kDefUnknownToken[];
static const bool kDefWithOffsets;
WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = kDefSuffixIndicator,
const int &max_bytes_per_token = kDefMaxBytesPerToken,
const std::string &unknown_token = kDefUnknownToken);
const std::string &unknown_token = kDefUnknownToken, const bool &with_offsets = kDefWithOffsets);
~WordpieceTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
protected:
Status AddSubword(const std::string &input_token, const int start, const int end,
Status AddSubword(const std::string &input_token, const int &start, const int &end,
std::vector<std::string> *out_token) const;
Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
Status FoundNoToken(const std::string &input_token, const uint32_t &basic_start, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found,
int *out_end) const;
Status GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const;
Status GetTokens(const std::string &input_token, const uint32_t &basic_start, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
private:
const std::shared_ptr<Vocab> vocab_;
const std::string suffix_indicator_;
const bool with_offsets_;
const int max_bytes_per_token_;
const std::string unknown_token_;
};

File diff suppressed because it is too large.

@ -25,7 +25,6 @@ from mindspore._c_expression import typing
from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, \
INT32_MAX, check_value
def check_unique_list_of_words(words, arg_name):
"""Check that words is a list and each element is a str without any duplication"""
@ -116,11 +115,22 @@ def check_from_dict(method):
def check_jieba_init(method):
"""Wrapper method to check the parameters of jieba add word."""
"""Wrapper method to check the parameters of jieba init."""
@wraps(method)
def new_method(self, *args, **kwargs):
parse_user_args(method, *args, **kwargs)
[hmm_path, mp_path, _, with_offsets], _ = parse_user_args(method, *args, **kwargs)
if hmm_path is None:
raise ValueError("The dict of HMMSegment in cppjieba is not provided.")
if not isinstance(hmm_path, str):
raise TypeError("Wrong input type for hmm_path, should be string.")
if mp_path is None:
raise ValueError("The dict of MPSegment in cppjieba is not provided.")
if not isinstance(mp_path, str):
raise TypeError("Wrong input type for mp_path, should be string.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
@ -152,6 +162,128 @@ def check_jieba_add_dict(method):
return new_method
def check_with_offsets(method):
"""Wrapper method to check with_offsets when it is the only parameter."""
@wraps(method)
def new_method(self, *args, **kwargs):
[with_offsets], _ = parse_user_args(method, *args, **kwargs)
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
def check_unicode_script_tokenizer(method):
"""Wrapper method to check the parameter of UnicodeScriptTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[keep_whitespace, with_offsets], _ = parse_user_args(method, *args, **kwargs)
if not isinstance(keep_whitespace, bool):
raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
def check_wordpiece_tokenizer(method):
"""Wrapper method to check the parameter of WordpieceTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets], _ =\
parse_user_args(method, *args, **kwargs)
if vocab is None:
raise ValueError("vocab is not provided.")
if not isinstance(vocab, cde.Vocab):
raise TypeError("Wrong input type for vocab, should be Vocab object.")
if not isinstance(suffix_indicator, str):
raise TypeError("Wrong input type for suffix_indicator, should be string.")
if not isinstance(unknown_token, str):
raise TypeError("Wrong input type for unknown_token, should be string.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
check_uint32(max_bytes_per_token)
return method(self, *args, **kwargs)
return new_method
def check_regex_tokenizer(method):
"""Wrapper method to check the parameter of RegexTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[delim_pattern, keep_delim_pattern, with_offsets], _ = parse_user_args(method, *args, **kwargs)
if delim_pattern is None:
raise ValueError("delim_pattern is not provided.")
if not isinstance(delim_pattern, str):
raise TypeError("Wrong input type for delim_pattern, should be string.")
if not isinstance(keep_delim_pattern, str):
raise TypeError("Wrong input type for keep_delim_pattern, should be string.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
def check_basic_tokenizer(method):
"""Wrapper method to check the parameter of BasicTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[lower_case, keep_whitespace, _, preserve_unused, with_offsets], _ =\
parse_user_args(method, *args, **kwargs)
if not isinstance(lower_case, bool):
raise TypeError("Wrong input type for lower_case, should be boolean.")
if not isinstance(keep_whitespace, bool):
raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
if not isinstance(preserve_unused, bool):
raise TypeError("Wrong input type for preserve_unused_token, should be boolean.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
def check_bert_tokenizer(method):
"""Wrapper method to check the parameter of BertTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, _,
preserve_unused_token, with_offsets], _ = parse_user_args(method, *args, **kwargs)
if vocab is None:
raise ValueError("vocab is not provided.")
if not isinstance(vocab, cde.Vocab):
raise TypeError("Wrong input type for vocab, should be Vocab object.")
if not isinstance(suffix_indicator, str):
raise TypeError("Wrong input type for suffix_indicator, should be string.")
if not isinstance(max_bytes_per_token, int):
raise TypeError("Wrong input type for max_bytes_per_token, should be int.")
check_uint32(max_bytes_per_token)
if not isinstance(unknown_token, str):
raise TypeError("Wrong input type for unknown_token, should be string.")
if not isinstance(lower_case, bool):
raise TypeError("Wrong input type for lower_case, should be boolean.")
if not isinstance(keep_whitespace, bool):
raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
if not isinstance(preserve_unused_token, bool):
raise TypeError("Wrong input type for preserve_unused_token, should be boolean.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
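
For context, a hedged sketch of how these wrappers are meant to be applied; the actual `text/transforms.py` change is in the suppressed diff above, and the module alias `cde`, the inheritance pattern, and the class body here are assumptions modeled on the existing tokenizer classes.

```python
import mindspore._c_dataengine as cde   # assumed alias for the pybind bindings above

class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
    """Tokenize on ICU-defined whitespace, optionally emitting offsets columns."""

    @check_with_offsets                  # validator defined earlier in this file
    def __init__(self, with_offsets=False):
        self.with_offsets = with_offsets
        super().__init__(self.with_offsets)
```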
def check_from_dataset(method):
"""A wrapper that wrap a parameter checker to the original function."""

@ -39,21 +39,22 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opFuntions) {
std::string dataset_path = datasets_root_path_ + "/jiebadict";
std::string hmm_path = dataset_path + "/hmm_model.utf8";
std::string mp_path = dataset_path + "/jieba.dict.utf8";
std::shared_ptr<Tensor> output_tensor;
TensorRow input, output;
std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("今天天气太好了我们一起去外面玩吧");
Status s = op->Compute(input_tensor, &output_tensor);
input.push_back(input_tensor);
Status s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output_tensor->Rank(), 1);
EXPECT_EQ(output_tensor->Size(), 7);
CheckEqual(output_tensor, {0}, "今天天气");
CheckEqual(output_tensor, {1}, "太好了");
CheckEqual(output_tensor, {2}, "我们");
CheckEqual(output_tensor, {3}, "一起");
CheckEqual(output_tensor, {4}, "去");
CheckEqual(output_tensor, {5}, "外面");
CheckEqual(output_tensor, {6}, "玩吧");
EXPECT_EQ(output[0]->Rank(), 1);
EXPECT_EQ(output[0]->Size(), 7);
CheckEqual(output[0], {0}, "今天天气");
CheckEqual(output[0], {1}, "太好了");
CheckEqual(output[0], {2}, "我们");
CheckEqual(output[0], {3}, "一起");
CheckEqual(output[0], {4}, "去");
CheckEqual(output[0], {5}, "外面");
CheckEqual(output[0], {6}, "玩吧");
}
TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) {
@ -61,16 +62,17 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) {
std::string dataset_path = datasets_root_path_ + "/jiebadict";
std::string hmm_path = dataset_path + "/hmm_model.utf8";
std::string mp_path = dataset_path + "/jieba.dict.utf8";
std::shared_ptr<Tensor> output_tensor;
TensorRow input, output;
std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));
op->AddWord("男默女泪");
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("男默女泪");
Status s = op->Compute(input_tensor, &output_tensor);
input.push_back(input_tensor);
Status s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output_tensor->Rank(), 1);
EXPECT_EQ(output_tensor->Size(), 1);
CheckEqual(output_tensor, {0}, "男默女泪");
EXPECT_EQ(output[0]->Rank(), 1);
EXPECT_EQ(output[0]->Size(), 1);
CheckEqual(output[0], {0}, "男默女泪");
}
TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) {
@ -78,14 +80,15 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) {
std::string dataset_path = datasets_root_path_ + "/jiebadict";
std::string hmm_path = dataset_path + "/hmm_model.utf8";
std::string mp_path = dataset_path + "/jieba.dict.utf8";
std::shared_ptr<Tensor> output_tensor;
TensorRow input, output;
std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));
op->AddWord("男默女泪");
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("");
Status s = op->Compute(input_tensor, &output_tensor);
input.push_back(input_tensor);
Status s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output_tensor->Rank(), 1);
EXPECT_EQ(output_tensor->Size(), 1);
CheckEqual(output_tensor, {0}, "");
EXPECT_EQ(output[0]->Rank(), 1);
EXPECT_EQ(output[0]->Size(), 1);
CheckEqual(output[0], {0}, "");
}

File diff suppressed because it is too large.

@ -1,83 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing BasicTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"
test_paras = [
dict(
first=1,
last=6,
expected_tokens=
[['Welcome', 'to', 'Beijing', '', '', '', '', ''],
['', '', '', '', '', '', '', '', '', '', '', '', '', '', ''],
['😀', '', '', '😃', '', '', '😄', '', '', '😁', '', ''],
['', '', '', '1368', '', '1644', '', '', '', '', '',
'', '1644', '', '1911', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', ''],
['', '', '', '1368', '-', '1644', '', '', '', '',
'', '1644', '-', '1911', '', '', '', '', '', '', '',
'', '', '', '', '', '', 'における', '', '', 'の2つの', '', '', 'でした'],
['명나라', '(', '1368', '-', '1644', ')', '', '청나라', '(', '1644', '-', '1911', ')', '',
'중국', '봉건', '왕조의', '역사에서', '마지막', '', '왕조였다']]
),
dict(
first=7,
last=7,
expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
lower_case=True
),
]
def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False,
normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False):
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case,
keep_whitespace=keep_whitespace,
normalization_form=normalization_form,
preserve_unused_token=preserve_unused_token)
dataset = dataset.map(operations=basic_tokenizer)
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text'])
logger.info("Out:", text)
logger.info("Exp:", expected_tokens[count])
np.testing.assert_array_equal(text, expected_tokens[count])
count = count + 1
def test_basic_tokenizer():
"""
Test BasicTokenizer
"""
for paras in test_paras:
check_basic_tokenizer(**paras)
if __name__ == '__main__':
test_basic_tokenizer()

@ -1,238 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import mindspore.dataset as ds
from mindspore.dataset.text import JiebaTokenizer
from mindspore.dataset.text import JiebaMode, to_str
DATA_FILE = "../data/dataset/testJiebaDataset/3.txt"
DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*"
HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"
def test_jieba_1():
"""Test jieba tokenizer with MP mode"""
data = ds.TextFileDataset(DATA_FILE)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=1)
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
ret = []
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def test_jieba_1_1():
"""Test jieba tokenizer with HMM mode"""
data = ds.TextFileDataset(DATA_FILE)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=1)
expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def test_jieba_1_2():
"""Test jieba tokenizer with HMM MIX"""
data = ds.TextFileDataset(DATA_FILE)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=1)
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def test_jieba_2():
"""Test add_word"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
jieba_op.add_word("男默女泪")
expect = ['男默女泪', '市', '长江大桥']
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=2)
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def test_jieba_2_1():
"""Test add_word with freq"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
jieba_op.add_word("男默女泪", 10)
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=2)
expect = ['男默女泪', '市', '长江大桥']
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def test_jieba_2_2():
"""Test add_word with invalid None Input"""
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
try:
jieba_op.add_word(None)
except ValueError:
pass
def test_jieba_2_3():
"""Test add_word with freq, the value of freq affects the result of segmentation"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
jieba_op.add_word("江大桥", 20000)
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=2)
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def test_jieba_3():
"""Test add_dict with dict"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
user_dict = {
"男默女泪": 10
}
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
jieba_op.add_dict(user_dict)
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=1)
expect = ['男默女泪', '市', '长江大桥']
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def test_jieba_3_1():
"""Test add_dict with dict"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
user_dict = {
"男默女泪": 10,
"江大桥": 20000
}
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
jieba_op.add_dict(user_dict)
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=1)
expect = ['男默女泪', '市长', '江大桥']
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def test_jieba_4():
"""Test add_dict with a user dict file"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
jieba_op.add_dict(DICT_FILE)
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=1)
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def test_jieba_4_1():
"""Test add dict with invalid file path"""
DICT_FILE = ""
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
try:
jieba_op.add_dict(DICT_FILE)
except ValueError:
pass
def test_jieba_5():
"""Test add dict with file path"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
jieba_op.add_word("江大桥", 20000)
data = data.map(input_columns=["text"],
operations=jieba_op, num_parallel_workers=1)
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
def gen():
text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
yield (text,)
def pytoken_op(input_data):
te = str(to_str(input_data))
tokens = []
tokens.append(te[:5].encode("UTF8"))
tokens.append(te[5:10].encode("UTF8"))
tokens.append(te[10:].encode("UTF8"))
return np.array(tokens, dtype='S')
def test_jieba_6():
"""Test tokenizing with a user-defined Python function"""
data = ds.GeneratorDataset(gen, column_names=["text"])
data = data.map(input_columns=["text"],
operations=pytoken_op, num_parallel_workers=1)
expect = ['今天天气太', '好了我们一', '起去外面玩吧']
for i in data.create_dict_iterator():
ret = to_str(i["text"])
for index, item in enumerate(ret):
assert item == expect[index]
if __name__ == "__main__":
test_jieba_1()
test_jieba_1_1()
test_jieba_1_2()
test_jieba_2()
test_jieba_2_1()
test_jieba_2_2()
test_jieba_3()
test_jieba_3_1()
test_jieba_4()
test_jieba_4_1()
test_jieba_5()
test_jieba_2_3()
test_jieba_6()

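The deleted jieba tests above only compare token strings. As a rough illustration of how the same data could also be checked once offsets are emitted, the snippet below mirrors the three-column mapping pattern used by the new BasicTokenizer test that follows; it assumes the Python JiebaTokenizer wrapper accepts a with_offsets flag and that 3.txt holds the single sentence used by gen() above, so treat it as a sketch rather than part of the suite.

# Illustrative sketch (assumptions: JiebaTokenizer(..., with_offsets=True) is exposed
# in Python, and 3.txt contains the sentence used by gen() above).
import mindspore.dataset as ds
from mindspore.dataset.text import JiebaTokenizer, JiebaMode, to_str

HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"
DATA_FILE = "../data/dataset/testJiebaDataset/3.txt"

def check_jieba_with_offsets():
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    # With offsets enabled the op emits three columns, so the map needs explicit
    # output_columns and columns_order (same pattern as the BasicTokenizer test below).
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op, num_parallel_workers=1)
    raw = "今天天气太好了我们一起去外面玩吧".encode("UTF-8")
    for i in data.create_dict_iterator():
        tokens = to_str(i["token"])
        # Each (start, limit) pair should cover exactly the UTF-8 bytes of its token.
        for tok, start, limit in zip(tokens, i["offsets_start"], i["offsets_limit"]):
            assert raw[start:limit].decode("UTF-8") == tok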
@ -0,0 +1,138 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing BasicTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as text
BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"
test_paras = [
dict(
first=1,
last=6,
expected_tokens=
[['Welcome', 'to', 'Beijing', '', '', '', '', ''],
['', '', '', '', '', '', '', '', '', '', '', '', '', '', ''],
['😀', '', '', '😃', '', '', '😄', '', '', '😁', '', ''],
['', '', '', '1368', '', '1644', '', '', '', '', '',
'', '1644', '', '1911', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', ''],
['', '', '', '1368', '-', '1644', '', '', '', '',
'', '1644', '-', '1911', '', '', '', '', '', '', '',
'', '', '', '', '', '', 'における', '', '', 'の2つの', '', '', 'でした'],
['명나라', '(', '1368', '-', '1644', ')', '', '청나라', '(', '1644', '-', '1911', ')', '',
'중국', '봉건', '왕조의', '역사에서', '마지막', '', '왕조였다']],
expected_offsets_start=[[0, 8, 11, 18, 21, 24, 27, 30],
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42],
[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37],
[0, 3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49,
52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100],
[0, 3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51,
54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115],
[0, 10, 11, 15, 16, 20, 21, 25, 35, 36, 40, 41, 45, 46, 50, 57, 64, 74, 87, 97, 101]],
expected_offsets_limit=[[7, 10, 18, 21, 24, 27, 30, 33],
[3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45],
[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40],
[3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, 52, 55, 58,
61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 103],
[3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, 54,
57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115, 124],
[9, 11, 15, 16, 20, 21, 24, 34, 36, 40, 41, 45, 46, 49, 56, 63, 73, 86, 96, 100, 113]]
),
dict(
first=7,
last=7,
expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
expected_offsets_start=[[0, 5, 8, 10, 16]],
expected_offsets_limit=[[4, 7, 9, 15, 22]],
lower_case=True
),
]
def check_basic_tokenizer_default(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
lower_case=False, keep_whitespace=False,
normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
keep_whitespace=keep_whitespace,
normalization_form=normalization_form,
preserve_unused_token=preserve_unused_token)
dataset = dataset.map(operations=basic_tokenizer)
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['text'])
logger.info("Out:", token)
logger.info("Exp:", expected_tokens[count])
np.testing.assert_array_equal(token, expected_tokens[count])
count = count + 1
def check_basic_tokenizer_with_offsets(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
lower_case=False, keep_whitespace=False,
normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
keep_whitespace=keep_whitespace,
normalization_form=normalization_form,
preserve_unused_token=preserve_unused_token,
with_offsets=True)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=basic_tokenizer)
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token'])
logger.info("Out:", token)
logger.info("Exp:", expected_tokens[count])
np.testing.assert_array_equal(token, expected_tokens[count])
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count = count + 1
def test_basic_tokenizer_with_offsets():
"""
Test BasicTokenizer with with_offsets=True
"""
for paras in test_paras:
check_basic_tokenizer_with_offsets(**paras)
def test_basic_tokenizer_default():
"""
Test BasicTokenizer with with_offsets=False (default)
"""
for paras in test_paras:
check_basic_tokenizer_default(**paras)
if __name__ == '__main__':
test_basic_tokenizer_default()
test_basic_tokenizer_with_offsets()

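The expected_offsets_start/expected_offsets_limit values above are byte positions into the UTF-8 input line (1 byte per ASCII character, 3 bytes for the CJK characters, 4 for the emoji), so in the first entry 'Welcome' spans [0, 7), 'to' spans [8, 10) and 'Beijing' spans [11, 18). A small stand-alone helper, not part of the test file and named here only for illustration, recomputes such pairs for tokens that appear verbatim, in order, in the line:

# Illustration only: recompute (offsets_start, offsets_limit) pairs as UTF-8 byte
# positions, assuming each token occurs verbatim and in order in the line.
def byte_offsets(line, tokens):
    data = line.encode("UTF-8")
    starts, limits, pos = [], [], 0
    for tok in tokens:
        piece = tok.encode("UTF-8")
        pos = data.index(piece, pos)       # next occurrence of the token's bytes
        starts.append(pos)
        limits.append(pos + len(piece))
        pos += len(piece)
    return starts, limits

print(byte_offsets("Welcome to Beijing", ["Welcome", "to", "Beijing"]))
# -> ([0, 8, 11], [7, 10, 18]), matching the start of the first expected entry above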
@ -18,7 +18,7 @@ Testing BertTokenizer op in DE
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
import mindspore.dataset.text as text
BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt"
@ -39,6 +39,14 @@ test_paras = [
['', '', '', '', ''],
['', '', '', '', ''],
['', '', '', '', '']],
expected_offsets_start=[[0, 3, 6, 9, 12],
[0, 3, 6, 9, 12],
[0, 3, 6, 9, 12],
[0, 3, 6, 9, 12]],
expected_offsets_limit=[[3, 6, 9, 12, 15],
[3, 6, 9, 12, 15],
[3, 6, 9, 12, 15],
[3, 6, 9, 12, 15]],
vocab_list=vocab_bert
),
# test english text
@ -46,6 +54,8 @@ test_paras = [
first=5,
last=5,
expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
lower_case=True,
vocab_list=vocab_bert
),
@ -53,6 +63,8 @@ test_paras = [
first=5,
last=5,
expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
lower_case=False,
vocab_list=vocab_bert
),
@ -63,7 +75,9 @@ test_paras = [
expect_str=[
['😀', '', '', '😃', '', '', '😄', '', '', '😁', '', ''],
['', '', '']],
normalization_form=nlp.utils.NormalizeForm.NFKC,
expected_offsets_start=[[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37], [0, 3, 6]],
expected_offsets_limit=[[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40], [3, 6, 9]],
normalization_form=text.utils.NormalizeForm.NFKC,
vocab_list=vocab_bert
),
# test preserved tokens
@ -79,6 +93,8 @@ test_paras = [
['[unused1]'],
['[unused10]']
],
expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
@ -95,6 +111,8 @@ test_paras = [
['[unused1]'],
['[unused10]']
],
expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
lower_case=True,
vocab_list=vocab_bert,
preserve_unused_token=True,
@ -104,6 +122,8 @@ test_paras = [
first=15,
last=15,
expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
expected_offsets_start=[[0, 2, 3, 4, 5, 7, 8, 10, 11, 12]],
expected_offsets_limit=[[2, 3, 4, 5, 7, 8, 10, 11, 12, 14]],
preserve_unused_token=True,
vocab_list=vocab_bert
),
@ -112,6 +132,8 @@ test_paras = [
first=8,
last=8,
expect_str=[['[UNK]', ' ', '[CLS]']],
expected_offsets_start=[[0, 6, 7]],
expected_offsets_limit=[[6, 7, 12]],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
@ -121,6 +143,8 @@ test_paras = [
first=8,
last=8,
expect_str=[['unused', ' ', '[CLS]']],
expected_offsets_start=[[0, 6, 7]],
expected_offsets_limit=[[6, 7, 12]],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
@ -131,6 +155,8 @@ test_paras = [
first=8,
last=8,
expect_str=[['unused', ' ', '[', 'CLS', ']']],
expected_offsets_start=[[0, 6, 7, 8, 11]],
expected_offsets_limit=[[6, 7, 8, 11, 12]],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=False,
@ -140,20 +166,20 @@ test_paras = [
]
def check_bert_tokenizer(first, last, expect_str,
vocab_list,
suffix_indicator='##',
max_bytes_per_token=100, unknown_token='[UNK]',
lower_case=False, keep_whitespace=False,
normalization_form=nlp.utils.NormalizeForm.NONE,
preserve_unused_token=False):
def check_bert_tokenizer_default(first, last, expect_str,
expected_offsets_start, expected_offsets_limit,
vocab_list, suffix_indicator='##',
max_bytes_per_token=100, unknown_token='[UNK]',
lower_case=False, keep_whitespace=False,
normalization_form=text.utils.NormalizeForm.NONE,
preserve_unused_token=False):
dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
vocab = nlp.Vocab.from_list(vocab_list)
tokenizer_op = nlp.BertTokenizer(
vocab = text.Vocab.from_list(vocab_list)
tokenizer_op = text.BertTokenizer(
vocab=vocab, suffix_indicator=suffix_indicator,
max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
lower_case=lower_case, keep_whitespace=keep_whitespace,
@ -162,20 +188,59 @@ def check_bert_tokenizer(first, last, expect_str,
dataset = dataset.map(operations=tokenizer_op)
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text'])
logger.info("Out:", text)
token = text.to_str(i['text'])
logger.info("Out:", token)
logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(text, expect_str[count])
np.testing.assert_array_equal(token, expect_str[count])
count = count + 1
def test_bert_tokenizer():
def check_bert_tokenizer_with_offsets(first, last, expect_str,
expected_offsets_start, expected_offsets_limit,
vocab_list, suffix_indicator='##',
max_bytes_per_token=100, unknown_token='[UNK]',
lower_case=False, keep_whitespace=False,
normalization_form=text.utils.NormalizeForm.NONE,
preserve_unused_token=False):
dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
vocab = text.Vocab.from_list(vocab_list)
tokenizer_op = text.BertTokenizer(
vocab=vocab, suffix_indicator=suffix_indicator, max_bytes_per_token=max_bytes_per_token,
unknown_token=unknown_token, lower_case=lower_case, keep_whitespace=keep_whitespace,
normalization_form=normalization_form, preserve_unused_token=preserve_unused_token, with_offsets=True)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token'])
logger.info("Out:", token)
logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(token, expect_str[count])
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count = count + 1
def test_bert_tokenizer_default():
"""
Test BertTokenizer when with_offsets=False
"""
for paras in test_paras:
check_bert_tokenizer_default(**paras)
def test_bert_tokenizer_with_offsets():
"""
Test WordpieceTokenizer
Test BertTokenizer when with_offsets=True
"""
for paras in test_paras:
check_bert_tokenizer(**paras)
check_bert_tokenizer_with_offsets(**paras)
if __name__ == '__main__':
test_bert_tokenizer()
test_bert_tokenizer_default()
test_bert_tokenizer_with_offsets()

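The same three-column mapping shown in check_bert_tokenizer_with_offsets applies to any tokenizer constructed with with_offsets=True. A minimal, hypothetical sketch with WhitespaceTokenizer (assuming its Python wrapper also accepts the flag, which is not shown in the hunks above, and using a made-up data file path):

# Hypothetical sketch only: the with_offsets=True output contract
# (token / offsets_start / offsets_limit) applied to another tokenizer.
import mindspore.dataset as ds
import mindspore.dataset.text as text

DATA_FILE = "../data/dataset/testTokenizerData/whitespace.txt"   # made-up path

def show_whitespace_offsets():
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer_op = text.WhitespaceTokenizer(with_offsets=True)   # assumed flag
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer_op)
    for i in dataset.create_dict_iterator():
        # offsets are byte positions into the original UTF-8 line
        print(text.to_str(i['token']), i['offsets_start'], i['offsets_limit'])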