parent b6b254f6e4
commit 52d278e858
@@ -0,0 +1,82 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h"

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {
namespace api {

BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr<Dataset> child,
                                               std::shared_ptr<SentencePieceVocab> vocab,
                                               const std::vector<std::string> &col_names, uint32_t vocab_size,
                                               float character_coverage, SentencePieceModel model_type,
                                               const std::unordered_map<std::string, std::string> &params)
    : vocab_(vocab),
      col_names_(col_names),
      vocab_size_(vocab_size),
      character_coverage_(character_coverage),
      model_type_(model_type),
      params_(params) {
  this->children.push_back(child);
}

// Function to build BuildSentenceVocabNode
std::vector<std::shared_ptr<DatasetOp>> BuildSentenceVocabNode::Build() {
  // A vector containing shared pointers to the DatasetOps that this object will create
  std::vector<std::shared_ptr<DatasetOp>> node_ops;

  std::shared_ptr<BuildSentencePieceVocabOp> build_sentence_piece_vocab_op;
  build_sentence_piece_vocab_op = std::make_shared<BuildSentencePieceVocabOp>(
    vocab_, col_names_, vocab_size_, character_coverage_, model_type_, params_, connector_que_size_);
  node_ops.push_back(build_sentence_piece_vocab_op);
  return node_ops;
}

Status BuildSentenceVocabNode::ValidateParams() {
  if (vocab_ == nullptr) {
    std::string err_msg = "BuildSentenceVocabNode: vocab is null.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  // vocab_size_ is unsigned, so the "must be positive" requirement reduces to a zero check.
  if (vocab_size_ == 0) {
    std::string err_msg =
      "BuildSentenceVocabNode: vocab_size should be positive, but got: " + std::to_string(vocab_size_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (character_coverage_ < 0.98f || character_coverage_ > 1.0f) {
    std::string err_msg = "BuildSentenceVocabNode: character_coverage should be between 0.98 and 1.0, but got " +
                          std::to_string(character_coverage_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}
} // namespace api
} // namespace dataset
} // namespace mindspore
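Note: the range checks in ValidateParams() above can be read in isolation. Below is a minimal standalone sketch of the same validation pattern; the function name and error-string return convention are illustrative only and are not part of the MindSpore API.

#include <cstdint>
#include <iostream>
#include <string>

// Hypothetical standalone mirror of the checks in ValidateParams():
// returns an error message on failure, an empty string on success.
std::string ValidateVocabParams(uint32_t vocab_size, float character_coverage) {
  // vocab_size is unsigned, so "non-positive" can only mean zero.
  if (vocab_size == 0) {
    return "vocab_size should be positive, but got: " + std::to_string(vocab_size);
  }
  // SentencePiece training expects character coverage in [0.98, 1.0].
  if (character_coverage < 0.98f || character_coverage > 1.0f) {
    return "character_coverage should be between 0.98 and 1.0, but got: " + std::to_string(character_coverage);
  }
  return "";
}

int main() {
  std::cout << ValidateVocabParams(0, 0.9995f) << std::endl;    // rejected: zero vocab size
  std::cout << ValidateVocabParams(5000, 0.5f) << std::endl;    // rejected: coverage out of range
  std::cout << (ValidateVocabParams(5000, 0.9995f).empty() ? "ok" : "error") << std::endl;  // accepted
  return 0;
}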
@@ -0,0 +1,62 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_SENTENCE_PIECE_VOCAB_NODE_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_SENTENCE_PIECE_VOCAB_NODE_H_

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "minddata/dataset/include/datasets.h"

namespace mindspore {
namespace dataset {
namespace api {

class BuildSentenceVocabNode : public Dataset {
 public:
  /// \brief Constructor
  BuildSentenceVocabNode(std::shared_ptr<Dataset> child, std::shared_ptr<SentencePieceVocab> vocab,
                         const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
                         SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params);

  /// \brief Destructor
  ~BuildSentenceVocabNode() = default;

  /// \brief A base class override function to create the required runtime dataset op objects for this class
  /// \return The list of shared pointers to the newly created DatasetOps
  std::vector<std::shared_ptr<DatasetOp>> Build() override;

  /// \brief Parameters validation
  /// \return Status Status::OK() if all the parameters are valid
  Status ValidateParams() override;

 private:
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::vector<std::string> col_names_;
  uint32_t vocab_size_;
  float character_coverage_;
  SentencePieceModel model_type_;
  std::unordered_map<std::string, std::string> params_;
};

} // namespace api
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_SENTENCE_PIECE_VOCAB_NODE_H_
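For orientation, the header above follows the IR-node contract used throughout this directory: validate parameters first, then have Build() emit the runtime ops. A toy, self-contained sketch of that two-phase pattern follows; the type and member names are illustrative stand-ins, not the MindSpore API.

#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

struct DatasetOp {};  // stand-in for the runtime operator

// Toy version of the IR-node contract: ValidateParams() guards Build().
struct Node {
  virtual ~Node() = default;
  virtual bool ValidateParams() = 0;
  virtual std::vector<std::shared_ptr<DatasetOp>> Build() = 0;
};

struct VocabNode : Node {
  uint32_t vocab_size = 5000;
  bool ValidateParams() override { return vocab_size > 0; }
  std::vector<std::shared_ptr<DatasetOp>> Build() override {
    return {std::make_shared<DatasetOp>()};  // this IR node maps to a single runtime op
  }
};

int main() {
  VocabNode node;
  if (node.ValidateParams()) {
    std::cout << "built " << node.Build().size() << " op(s)" << std::endl;
  }
  return 0;
}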
@@ -0,0 +1,224 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "common/common.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/include/text.h"

// IR non-leaf nodes
#include "minddata/dataset/engine/ir/datasetops/map_node.h"

// IR leaf nodes
#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"

using namespace mindspore::dataset::api;
using mindspore::dataset::SentencePieceModel;
using mindspore::dataset::SentencePieceVocab;
using mindspore::dataset::ShuffleMode;
using mindspore::dataset::Tensor;

class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};

TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess1 plus sentencepiece tokenizer.";

  // Create a TextFile dataset
  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds_vocab, nullptr);

  // Create vocab from dataset
  std::shared_ptr<SentencePieceVocab> vocab =
    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
  EXPECT_NE(vocab, nullptr);

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/sentencepiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);

  // Create SentencePieceTokenizer operation from vocab object
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer =
    text::SentencePieceTokenizer(vocab, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_NE(sentencepiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({sentencepiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Expected result after tokenization
  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    MS_LOG(INFO) << *txt;
    std::shared_ptr<Tensor> expected_tensor;
    Tensor::CreateFromVector(expected, &expected_tensor);
    EXPECT_EQ(*txt, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 1);
}

TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess2 plus sentencepiece tokenizer.";

  // Create a TextFile dataset
  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds_vocab, nullptr);

  // Create vocab from dataset
  std::shared_ptr<SentencePieceVocab> vocab =
    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
  EXPECT_NE(vocab, nullptr);

  // Save vocab model to a local file
  vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model");

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/sentencepiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);

  // Create SentencePieceTokenizer operation from the local vocab model
  std::string vocab_model = datasets_root_path_ + "/test_sentencepiece/m.model";
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer =
    text::SentencePieceTokenizer(vocab_model, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_NE(sentencepiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({sentencepiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Expected result after tokenization
  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    MS_LOG(INFO) << *txt;
    std::shared_ptr<Tensor> expected_tensor;
    Tensor::CreateFromVector(expected, &expected_tensor);
    EXPECT_EQ(*txt, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 1);
}

TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabFail with incorrect parameters.";

  // Create a TextFile dataset
  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds_vocab, nullptr);

  // vocab_size cannot be less than or equal to 0
  std::shared_ptr<SentencePieceVocab> vocab1 =
    ds_vocab->BuildSentencePieceVocab({}, 0, 0.9995, SentencePieceModel::kUnigram, {});
  EXPECT_EQ(vocab1, nullptr);

  // character_coverage should be between 0.98 and 1.0
  std::shared_ptr<SentencePieceVocab> vocab2 =
    ds_vocab->BuildSentencePieceVocab({}, 1, 0.979, SentencePieceModel::kUnigram, {});
  EXPECT_EQ(vocab2, nullptr);

  // character_coverage should be between 0.98 and 1.0
  std::shared_ptr<SentencePieceVocab> vocab3 =
    ds_vocab->BuildSentencePieceVocab({}, 1, 1.01, SentencePieceModel::kUnigram, {});
  EXPECT_EQ(vocab3, nullptr);

  // column name does not exist
  std::shared_ptr<SentencePieceVocab> vocab4 =
    ds_vocab->BuildSentencePieceVocab({"image"}, 2, 0.98, SentencePieceModel::kUnigram, {});
  EXPECT_EQ(vocab4, nullptr);
}

TEST_F(MindDataTestPipeline, TestSentencePieceTokenizerFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceTokenizerFail1 with incorrect parameters.";

  // Create SentencePieceTokenizer operation from an empty vocab model path
  std::string vocab_model1 = "";
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer1 =
    text::SentencePieceTokenizer(vocab_model1, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_EQ(sentencepiece_tokenizer1, nullptr);

  // Create SentencePieceTokenizer operation from a non-existent local vocab model
  std::string vocab_model2 = "m.model";
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer2 =
    text::SentencePieceTokenizer(vocab_model2, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_EQ(sentencepiece_tokenizer2, nullptr);

  // Create SentencePieceTokenizer operation from a null vocab object
  std::shared_ptr<SentencePieceVocab> vocab_model3 = nullptr;
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer3 =
    text::SentencePieceTokenizer(vocab_model3, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_EQ(sentencepiece_tokenizer3, nullptr);
}

TEST_F(MindDataTestPipeline, TestSentencePieceTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceTokenizerFail2 with an invalid SentencePieceVocab object.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/sentencepiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);

  // Create SentencePieceTokenizer operation from an empty (untrained) vocab object
  std::shared_ptr<SentencePieceVocab> vocab_model4 = std::make_shared<SentencePieceVocab>();
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer4 =
    text::SentencePieceTokenizer(vocab_model4, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_NE(sentencepiece_tokenizer4, nullptr);

  // Create Map operation on ds
  ds = ds->Map({sentencepiece_tokenizer4}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterating should fail because the vocab object was never trained
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  EXPECT_EQ(iter->GetNextRow(&row), false);
}
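The success tests all drain the iterator with the same pattern: fetch a row, loop while the row is non-empty, fetch again. A toy stand-in follows (the FakeIterator type is illustrative, not the MindSpore Iterator API) showing why EXPECT_EQ(i, 1) holds for a one-line input file.

#include <cstdint>
#include <deque>
#include <iostream>
#include <map>
#include <string>

using Row = std::map<std::string, std::string>;

// Fake iterator: pops rows until exhausted, then returns an empty row,
// which is the loop-termination condition used in the tests above.
struct FakeIterator {
  std::deque<Row> rows;
  void GetNextRow(Row *row) {
    if (rows.empty()) {
      row->clear();
    } else {
      *row = rows.front();
      rows.pop_front();
    }
  }
};

int main() {
  FakeIterator iter;
  iter.rows.push_back({{"text", "one tokenized line"}});  // single-row source

  Row row;
  iter.GetNextRow(&row);
  uint64_t i = 0;
  while (!row.empty()) {
    i++;
    iter.GetNextRow(&row);
  }
  std::cout << i << std::endl;  // prints 1, matching EXPECT_EQ(i, 1)
  return 0;
}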