/** * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "common/common.h" #include "dataset/text/kernels/basic_tokenizer_op.h" #include "dataset/text/kernels/case_fold_op.h" #include "dataset/text/kernels/normalize_utf8_op.h" #include "dataset/text/kernels/regex_replace_op.h" #include "dataset/text/kernels/regex_tokenizer_op.h" #include "dataset/text/kernels/unicode_char_tokenizer_op.h" #include "dataset/text/kernels/unicode_script_tokenizer_op.h" #include "dataset/text/kernels/whitespace_tokenizer_op.h" #include "gtest/gtest.h" #include "utils/log_adapter.h" using namespace mindspore::dataset; class MindDataTestTokenizerOp : public UT::Common { public: void CheckEqual(const std::shared_ptr &o, const std::vector &index, const std::string &expect) { std::string_view str; Status s = o->GetItemAt(&str, index); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(str, expect); } }; TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) { MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp."; std::unique_ptr op(new UnicodeCharTokenizerOp(true)); std::shared_ptr input = std::make_shared("Hello World!"); TensorRow output; Status s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 12); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); CheckEqual(output[0], {0}, "H"); CheckEqual(output[0], {1}, "e"); CheckEqual(output[0], {2}, "l"); CheckEqual(output[0], {3}, "l"); CheckEqual(output[0], {4}, "o"); CheckEqual(output[0], {5}, " "); CheckEqual(output[0], {6}, "W"); CheckEqual(output[0], {7}, "o"); CheckEqual(output[0], {8}, "r"); CheckEqual(output[0], {9}, "l"); CheckEqual(output[0], {10}, "d"); CheckEqual(output[0], {11}, "!"); input = std::make_shared("中国 你好!"); output.clear(); s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 6); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); CheckEqual(output[0], {0}, "中"); CheckEqual(output[0], {1}, "国"); CheckEqual(output[0], {2}, " "); CheckEqual(output[0], {3}, "你"); CheckEqual(output[0], {4}, "好"); CheckEqual(output[0], {5}, "!"); input = std::make_shared("中"); output.clear(); s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); CheckEqual(output[0], {0}, "中"); input = std::make_shared("H"); output.clear(); s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); CheckEqual(output[0], {0}, "H"); input = std::make_shared(" "); output.clear(); s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 2); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); CheckEqual(output[0], {0}, " "); CheckEqual(output[0], {1}, " "); input = std::make_shared(""); output.clear(); s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString(); CheckEqual(output[0], {0}, ""); } TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) { MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp."; std::unique_ptr op(new WhitespaceTokenizerOp(true)); std::shared_ptr input = std::make_shared("Welcome to China."); TensorRow output; Status s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 3); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); CheckEqual(output[0], {0}, "Welcome"); CheckEqual(output[0], {1}, "to"); CheckEqual(output[0], {2}, "China."); input = std::make_shared(" hello"); output.clear(); s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); CheckEqual(output[0], {0}, "hello"); input = std::make_shared("hello"); output.clear(); s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); CheckEqual(output[0], {0}, "hello"); input = std::make_shared("hello "); output.clear(); s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); CheckEqual(output[0], {0}, "hello"); input = std::make_shared(" "); output.clear(); s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); CheckEqual(output[0], {0}, ""); } TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) { MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer."; std::unique_ptr keep_whitespace_op(new UnicodeScriptTokenizerOp(true, true)); std::unique_ptr skip_whitespace_op(new UnicodeScriptTokenizerOp(false, true)); std::shared_ptr input = std::make_shared("Welcome to China. \n 中国\t北京"); TensorRow output; Status s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 10); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); CheckEqual(output[0], {0}, "Welcome"); CheckEqual(output[0], {1}, " "); CheckEqual(output[0], {2}, "to"); CheckEqual(output[0], {3}, " "); CheckEqual(output[0], {4}, "China"); CheckEqual(output[0], {5}, "."); CheckEqual(output[0], {6}, " \n "); CheckEqual(output[0], {7}, "中国"); CheckEqual(output[0], {8}, "\t"); CheckEqual(output[0], {9}, "北京"); output.clear(); s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 6); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); CheckEqual(output[0], {0}, "Welcome"); CheckEqual(output[0], {1}, "to"); CheckEqual(output[0], {2}, "China"); CheckEqual(output[0], {3}, "."); CheckEqual(output[0], {4}, "中国"); CheckEqual(output[0], {5}, "北京"); input = std::make_shared(" Welcome to 中国. "); output.clear(); s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 4); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); CheckEqual(output[0], {0}, "Welcome"); CheckEqual(output[0], {1}, "to"); CheckEqual(output[0], {2}, "中国"); CheckEqual(output[0], {3}, "."); output.clear(); s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 8); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); CheckEqual(output[0], {0}, " "); CheckEqual(output[0], {1}, "Welcome"); CheckEqual(output[0], {2}, " "); CheckEqual(output[0], {3}, "to"); CheckEqual(output[0], {4}, " "); CheckEqual(output[0], {5}, "中国"); CheckEqual(output[0], {6}, "."); CheckEqual(output[0], {7}, " "); input = std::make_shared("Hello"); output.clear(); s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); CheckEqual(output[0], {0}, "Hello"); input = std::make_shared("H"); output.clear(); s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString(); CheckEqual(output[0], {0}, "H"); input = std::make_shared(""); output.clear(); s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor7: " << output[0]->ToString(); CheckEqual(output[0], {0}, ""); input = std::make_shared("Hello中国Hello世界"); output.clear(); s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 4); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor8: " << output[0]->ToString(); CheckEqual(output[0], {0}, "Hello"); CheckEqual(output[0], {1}, "中国"); CheckEqual(output[0], {2}, "Hello"); CheckEqual(output[0], {3}, "世界"); input = std::make_shared(" "); output.clear(); s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor10: " << output[0]->ToString(); CheckEqual(output[0], {0}, " "); input = std::make_shared(" "); output.clear(); s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 1); EXPECT_EQ(output[0]->Rank(), 1); MS_LOG(INFO) << "Out tensor11: " << output[0]->ToString(); CheckEqual(output[0], {0}, ""); } TEST_F(MindDataTestTokenizerOp, TestCaseFold) { MS_LOG(INFO) << "Doing TestCaseFold."; std::unique_ptr case_fold_op(new CaseFoldOp()); std::shared_ptr input = std::make_shared("Welcome to China. \n 中国\t北京"); std::shared_ptr output; Status s = case_fold_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output->Rank(), 0); MS_LOG(INFO) << "Out tensor1: " << output->ToString(); CheckEqual(output, {}, "welcome to china. \n 中国\t北京"); } TEST_F(MindDataTestTokenizerOp, TestNormalize) { MS_LOG(INFO) << "Doing TestNormalize."; std::unique_ptr nfc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfc)); std::unique_ptr nfkc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkc)); std::unique_ptr nfd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfd)); std::unique_ptr nfkd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkd)); std::shared_ptr input = std::make_shared("ṩ"); std::shared_ptr output; Status s = nfc_normalize_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); MS_LOG(INFO) << "NFC str:" << output->ToString(); nfkc_normalize_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); MS_LOG(INFO) << "NFKC str:" << output->ToString(); nfd_normalize_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); MS_LOG(INFO) << "NFD str:" << output->ToString(); nfkd_normalize_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); MS_LOG(INFO) << "NFKD str:" << output->ToString(); } TEST_F(MindDataTestTokenizerOp, TestRegexReplace) { MS_LOG(INFO) << "Doing TestRegexReplace."; std::unique_ptr regex_replace_op(new RegexReplaceOp("\\s+", "_", true)); std::shared_ptr input = std::make_shared("Welcome to China. \n 中国\t北京"); std::shared_ptr output; Status s = regex_replace_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output->Rank(), 0); MS_LOG(INFO) << "Out tensor1: " << output->ToString(); CheckEqual(output, {}, "Welcome_to_China._中国_北京"); } TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) { MS_LOG(INFO) << "Doing TestRegexTokenizerOp."; std::unique_ptr regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "", true)); std::shared_ptr input = std::make_shared("Welcome to China. \n 中国\t北京"); TensorRow output; Status s = regex_tokenizer_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); } TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) { MS_LOG(INFO) << "Doing TestBasicTokenizer."; //bool lower_case, bool keep_whitespace, // NormalizeForm normalization_form, bool preserve_unused_token std::unique_ptr basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false, true)); std::shared_ptr input = std::make_shared("Welcome to China. 中国\t北京"); TensorRow output; Status s = basic_tokenizer->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); }