From f48ab2b5c9ee6a24a9ea40c585c718150f16d88f Mon Sep 17 00:00:00 2001 From: mohammad Date: Sat, 9 Jan 2021 17:09:26 -0500 Subject: [PATCH] make MD public headers standalone --- .../ccsrc/minddata/dataset/api/config.cc | 1 + .../ccsrc/minddata/dataset/api/datasets.cc | 19 +++++++++- .../ccsrc/minddata/dataset/api/iterator.cc | 4 +++ .../dataset/include/datasets_bindings.cc | 2 ++ .../python/bindings/dataset/text/bindings.cc | 1 + mindspore/ccsrc/minddata/dataset/api/text.cc | 36 ++++++++++++++----- .../ccsrc/minddata/dataset/core/constants.h | 3 ++ .../minddata/dataset/core/global_context.h | 2 +- .../dataset/engine/consumers/tree_consumer.cc | 4 ++- .../build_sentence_piece_vocab_node.h | 1 + .../engine/ir/datasetops/dataset_node.h | 15 +++++++- .../ir/datasetops/source/minddata_node.h | 1 + .../ir/datasetops/source/random_node.cc | 1 + .../engine/ir/datasetops/source/random_node.h | 2 ++ .../ccsrc/minddata/dataset/include/config.h | 2 -- .../minddata/dataset/include/constants.h | 3 ++ .../ccsrc/minddata/dataset/include/datasets.h | 31 +++++++++------- .../minddata/dataset/include/de_tensor.h | 8 ++--- .../ccsrc/minddata/dataset/include/execute.h | 9 ++--- .../ccsrc/minddata/dataset/include/iterator.h | 5 ++- .../ccsrc/minddata/dataset/include/samplers.h | 3 +- .../ccsrc/minddata/dataset/include/status.h | 22 ++++++++++++ .../ccsrc/minddata/dataset/include/text.h | 35 +++++++++--------- .../ccsrc/minddata/dataset/include/type_id.h | 1 - .../dataset/text/sentence_piece_vocab.h | 3 +- tests/ut/cpp/dataset/build_vocab_test.cc | 1 + tests/ut/cpp/dataset/c_api_cache_test.cc | 3 -- .../ut/cpp/dataset/c_api_dataset_ops_test.cc | 1 + tests/ut/cpp/dataset/c_api_samplers_test.cc | 1 + .../c_api_text_sentence_piece_vocab_test.cc | 5 +-- tests/ut/cpp/dataset/c_api_text_test.cc | 24 ++++++------- tests/ut/cpp/dataset/c_api_text_vocab_test.cc | 15 ++++---- tests/ut/cpp/dataset/ir_callback_test.cc | 1 + 33 files changed, 182 insertions(+), 83 deletions(-) diff --git a/mindspore/ccsrc/minddata/dataset/api/config.cc b/mindspore/ccsrc/minddata/dataset/api/config.cc index aba7e87159..0f5bfe9217 100644 --- a/mindspore/ccsrc/minddata/dataset/api/config.cc +++ b/mindspore/ccsrc/minddata/dataset/api/config.cc @@ -17,6 +17,7 @@ #include "minddata/dataset/core/config_manager.h" #include "minddata/dataset/core/global_context.h" #include "minddata/dataset/include/config.h" +#include "minddata/dataset/util/log_adapter.h" #include "minddata/dataset/util/status.h" namespace mindspore { diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc index e666442810..15300fb2a5 100644 --- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc +++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc @@ -19,16 +19,33 @@ #include #include #include + +#include "minddata/dataset/engine/runtime_context.h" #include "minddata/dataset/include/samplers.h" #include "minddata/dataset/include/transforms.h" +#include "minddata/dataset/util/path.h" +#include "minddata/dataset/util/status.h" + +#include "minddata/dataset/core/client.h" +#include "minddata/dataset/engine/consumers/tree_consumer.h" + +#include "minddata/dataset/kernels/c_func_op.h" +#include "minddata/dataset/kernels/tensor_op.h" #ifndef ENABLE_ANDROID #include "minddata/dataset/engine/ir/cache/dataset_cache_impl.h" #endif +#ifndef ENABLE_ANDROID +#include "minddata/dataset/text/sentence_piece_vocab.h" +#include "minddata/dataset/text/vocab.h" +#endif + // Sampler headers (in alphabetical order) #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" + // IR non-leaf nodes #include "minddata/dataset/engine/ir/datasetops/batch_node.h" #ifndef ENABLE_ANDROID @@ -57,7 +74,6 @@ #endif #include "minddata/dataset/core/config_manager.h" -#include "minddata/dataset/util/path.h" #include "minddata/dataset/util/random.h" #include "minddata/dataset/util/services.h" @@ -939,6 +955,7 @@ TFRecordDataset::TFRecordDataset(const std::vector &dataset_files, shard_id, shard_equal_rows, cache); ir_node_ = std::static_pointer_cast(ds); } + #endif } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/api/iterator.cc b/mindspore/ccsrc/minddata/dataset/api/iterator.cc index e9fda03b2c..9daa7403ad 100644 --- a/mindspore/ccsrc/minddata/dataset/api/iterator.cc +++ b/mindspore/ccsrc/minddata/dataset/api/iterator.cc @@ -16,11 +16,15 @@ #include "minddata/dataset/include/iterator.h" #include "minddata/dataset/core/client.h" #include "minddata/dataset/engine/consumers/tree_consumer.h" +#include "minddata/dataset/engine/runtime_context.h" #include "minddata/dataset/include/datasets.h" namespace mindspore { namespace dataset { +Iterator::Iterator() : consumer_(nullptr) {} +Iterator::~Iterator() { Stop(); } + // Get the next row from the data pipeline. bool Iterator::GetNextRow(TensorMap *row) { Status rc = consumer_->GetNextAsMap(row); diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc index 852ca3b9c9..5a8578d069 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc @@ -23,6 +23,8 @@ #include "minddata/dataset/core/constants.h" #include "minddata/dataset/core/global_context.h" #include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/text/sentence_piece_vocab.h" + // IR non-leaf nodes #include "minddata/dataset/engine/ir/datasetops/batch_node.h" #include "minddata/dataset/engine/ir/datasetops/concat_node.h" diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc index 244b3402ab..d746fa298f 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc @@ -21,6 +21,7 @@ #include "minddata/dataset/api/python/pybind_register.h" #include "minddata/dataset/text/vocab.h" #include "minddata/dataset/text/sentence_piece_vocab.h" +#include "minddata/dataset/include/constants.h" namespace mindspore { namespace dataset { diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc index 60185bb22c..5981331007 100644 --- a/mindspore/ccsrc/minddata/dataset/api/text.cc +++ b/mindspore/ccsrc/minddata/dataset/api/text.cc @@ -39,6 +39,7 @@ #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" #endif +#include "minddata/dataset/core/data_type.h" #include "minddata/dataset/util/path.h" namespace mindspore { @@ -87,7 +88,7 @@ std::shared_ptr JiebaTokenizer(const std::string &hmm_p } std::shared_ptr Lookup(const std::shared_ptr &vocab, const std::string &unknown_token, - const DataType &data_type) { + const std::string &data_type) { auto op = std::make_shared(vocab, unknown_token, data_type); return op->ValidateParams() ? op : nullptr; @@ -142,7 +143,7 @@ std::shared_ptr SlidingWindow(const int32_t width, const return op->ValidateParams() ? op : nullptr; } -std::shared_ptr ToNumber(const DataType data_type) { +std::shared_ptr ToNumber(const std::string &data_type) { auto op = std::make_shared(data_type); return op->ValidateParams() ? op : nullptr; @@ -200,6 +201,19 @@ Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::s return Status::OK(); } +// Helper functions to help validate data type passed by user +bool IsTypeNumeric(const std::string &data_type) { + if (data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" || + data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" || + data_type == "float16" || data_type == "float32" || data_type == "float64") + return true; + return false; +} + +bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; } + +bool IsTypeString(const std::string &data_type) { return data_type == "string"; } + /* ####################################### Derived TensorOperation classes ################################# */ // (In alphabetical order) @@ -239,6 +253,8 @@ BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr &voc preserve_unused_token_(preserve_unused_token), with_offsets_(with_offsets) {} +BertTokenizerOperation::~BertTokenizerOperation() = default; + Status BertTokenizerOperation::ValidateParams() { if (vocab_ == nullptr) { std::string err_msg = "BertTokenizer: vocab object type is incorrect or null."; @@ -303,9 +319,11 @@ std::shared_ptr JiebaTokenizerOperation::Build() { // LookupOperation LookupOperation::LookupOperation(const std::shared_ptr &vocab, const std::string &unknown_token, - const DataType &data_type) + const std::string &data_type) : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {} +LookupOperation::~LookupOperation() = default; + Status LookupOperation::ValidateParams() { if (vocab_ == nullptr) { std::string err_msg = "Lookup: vocab object type is incorrect or null."; @@ -320,7 +338,7 @@ Status LookupOperation::ValidateParams() { RETURN_STATUS_SYNTAX_ERROR(err_msg); } - if (!data_type_.IsNumeric()) { + if (!IsTypeNumeric(data_type_)) { std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric."; MS_LOG(ERROR) << err_msg; RETURN_STATUS_SYNTAX_ERROR(err_msg); @@ -330,7 +348,7 @@ Status LookupOperation::ValidateParams() { } std::shared_ptr LookupOperation::Build() { - std::shared_ptr tensor_op = std::make_shared(vocab_, default_id_, data_type_); + std::shared_ptr tensor_op = std::make_shared(vocab_, default_id_, DataType(data_type_)); return tensor_op; } @@ -419,6 +437,8 @@ std::shared_ptr RegexTokenizerOperation::Build() { #endif // SentencePieceTokenizerOperation +SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default; + SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr &vocab, SPieceTokenizerOutType out_type) : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {} @@ -482,11 +502,11 @@ std::shared_ptr SlidingWindowOperation::Build() { } // ToNumberOperation -ToNumberOperation::ToNumberOperation(DataType data_type) : data_type_(data_type) {} +ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {} Status ToNumberOperation::ValidateParams() { - if (!data_type_.IsNumeric() || data_type_.IsBool()) { - std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_.ToString(); + if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) { + std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_; MS_LOG(ERROR) << err_msg; RETURN_STATUS_SYNTAX_ERROR(err_msg); } diff --git a/mindspore/ccsrc/minddata/dataset/core/constants.h b/mindspore/ccsrc/minddata/dataset/core/constants.h index 0e0f485ba0..480eb682f0 100644 --- a/mindspore/ccsrc/minddata/dataset/core/constants.h +++ b/mindspore/ccsrc/minddata/dataset/core/constants.h @@ -59,6 +59,9 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 }; // Possible values for SPieceTokenizerLoadType enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 }; +// Possible values for SentencePieceModel +enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 }; + // Possible values for NormalizeForm enum class NormalizeForm { kNone = 0, diff --git a/mindspore/ccsrc/minddata/dataset/core/global_context.h b/mindspore/ccsrc/minddata/dataset/core/global_context.h index 031c591ed8..bf8c92cc11 100644 --- a/mindspore/ccsrc/minddata/dataset/core/global_context.h +++ b/mindspore/ccsrc/minddata/dataset/core/global_context.h @@ -19,6 +19,7 @@ #include #include +#include "minddata/dataset/core/config_manager.h" #include "minddata/dataset/core/constants.h" #include "minddata/dataset/util/allocator.h" #include "minddata/dataset/util/status.h" @@ -27,7 +28,6 @@ namespace mindspore { namespace dataset { // forward declare class MemoryPool; -class ConfigManager; class Tensor; class CVTensor; diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc index f8a925bc9f..6f5e3111e5 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc @@ -22,8 +22,10 @@ #include #include #include "minddata/dataset/engine/consumers/tree_consumer.h" -#include "minddata/dataset/engine/tree_adapter.h" +#include "minddata/dataset/engine/datasetops/device_queue_op.h" #include "minddata/dataset/engine/opt/pre/getter_pass.h" +#include "minddata/dataset/engine/tree_adapter.h" +#include "minddata/mindrecord/include/shard_index_generator.h" #ifndef ENABLE_ANDROID #include "minddata/mindrecord/include/shard_header.h" diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h index 3904b6cbb9..8dd040f38a 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h @@ -23,6 +23,7 @@ #include #include +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" #include "minddata/dataset/include/datasets.h" namespace mindspore { diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h index 8720b557e9..7774af336b 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h @@ -24,13 +24,26 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/core/config_manager.h" #include "minddata/dataset/engine/consumers/tree_consumer.h" +#include "minddata/dataset/engine/data_schema.h" +#include "minddata/dataset/engine/datasetops/filter_op.h" +#include "minddata/dataset/engine/datasetops/map_op/map_op.h" +#include "minddata/dataset/engine/datasetops/project_op.h" +#include "minddata/dataset/engine/datasetops/repeat_op.h" +#include "minddata/dataset/engine/datasetops/shuffle_op.h" +#include "minddata/dataset/engine/datasetops/skip_op.h" +#include "minddata/dataset/engine/datasetops/take_op.h" +#include "minddata/dataset/engine/ir/cache/dataset_cache.h" +#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/util/path.h" +#include "minddata/dataset/util/status.h" namespace mindspore { namespace dataset { class Dataset; +class DatasetCache; class SamplerObj; class IRNodePass; class DatasetSizeGetter; diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h index 1503baf9d1..031439c3cf 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h @@ -22,6 +22,7 @@ #include #include +#include "minddata/dataset/engine/datasetops/source/mindrecord_op.h" #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc index 840d3082bb..6b8a75e9b9 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc @@ -24,6 +24,7 @@ #include "minddata/dataset/engine/datasetops/source/random_data_op.h" #include "minddata/dataset/util/random.h" #include "minddata/dataset/util/status.h" + namespace mindspore { namespace dataset { diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h index 1894441896..8a0bae512f 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h @@ -22,7 +22,9 @@ #include #include +#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" +#include "minddata/dataset/include/samplers.h" namespace mindspore { namespace dataset { diff --git a/mindspore/ccsrc/minddata/dataset/include/config.h b/mindspore/ccsrc/minddata/dataset/include/config.h index 2f159c0287..36b6860f69 100644 --- a/mindspore/ccsrc/minddata/dataset/include/config.h +++ b/mindspore/ccsrc/minddata/dataset/include/config.h @@ -20,8 +20,6 @@ #include #include -#include "minddata/dataset/util/log_adapter.h" - namespace mindspore { namespace dataset { diff --git a/mindspore/ccsrc/minddata/dataset/include/constants.h b/mindspore/ccsrc/minddata/dataset/include/constants.h index 0e03df5c50..e5474847f4 100644 --- a/mindspore/ccsrc/minddata/dataset/include/constants.h +++ b/mindspore/ccsrc/minddata/dataset/include/constants.h @@ -59,6 +59,9 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 }; // Possible values for SPieceTokenizerLoadType enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 }; +// Possible values for SentencePieceModel +enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 }; + // Possible values for NormalizeForm enum class NormalizeForm { kNone = 0, diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h index 6f686f0659..b5ad1ec77d 100644 --- a/mindspore/ccsrc/minddata/dataset/include/datasets.h +++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h @@ -17,6 +17,7 @@ #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASETS_H_ #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASETS_H_ +#include #include #include #include @@ -26,27 +27,18 @@ #include #include #include -#include "minddata/dataset/engine/ir/cache/dataset_cache.h" -#include "minddata/dataset/core/constants.h" -#include "minddata/dataset/engine/consumers/tree_consumer.h" -#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" #include "minddata/dataset/include/iterator.h" #include "minddata/dataset/include/samplers.h" #include "minddata/dataset/include/tensor.h" +#include "minddata/dataset/include/text.h" #include "minddata/dataset/include/type_id.h" -#include "minddata/dataset/kernels/c_func_op.h" -#include "minddata/dataset/kernels/tensor_op.h" -#include "minddata/dataset/util/path.h" -#ifndef ENABLE_ANDROID -#include "minddata/dataset/text/sentence_piece_vocab.h" -#include "minddata/dataset/text/vocab.h" -#endif namespace mindspore { namespace dataset { class Tensor; +class TensorRow; class TensorShape; class TreeAdapter; class TreeGetters; @@ -54,6 +46,7 @@ class TreeGetters; class Vocab; #endif +class DatasetCache; class DatasetNode; class Iterator; @@ -77,12 +70,20 @@ class ConcatDataset; class RenameDataset; #endif +#ifndef ENABLE_ANDROID +class SentencePieceVocab; +enum class SentencePieceModel; +#endif + +class DSCallback; + class RepeatDataset; #ifndef ENABLE_ANDROID class SkipDataset; class TakeDataset; class ZipDataset; + #endif /// \class Dataset datasets.h @@ -969,8 +970,12 @@ std::shared_ptr TFRecord(const std::vector &datase } else { std::string schema_path = schema; if (!schema_path.empty()) { - Path schema_file(schema_path); - if (!schema_file.Exists()) { + struct stat sb; + int rc = stat(common::SafeCStr(schema_path), &sb); + if (rc == -1 && errno != ENOENT) { + MS_LOG(WARNING) << "Unable to query the status of [" << schema_path << "]. Errno = " << errno << "."; + } + if (rc != 0) { MS_LOG(ERROR) << "TFRecordDataset: schema path [" << schema_path << "] is invalid or does not exist."; return nullptr; } diff --git a/mindspore/ccsrc/minddata/dataset/include/de_tensor.h b/mindspore/ccsrc/minddata/dataset/include/de_tensor.h index eb7c457b3f..2e20ca12b9 100644 --- a/mindspore/ccsrc/minddata/dataset/include/de_tensor.h +++ b/mindspore/ccsrc/minddata/dataset/include/de_tensor.h @@ -14,14 +14,14 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_ +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_ #include #include #include #include "include/ms_tensor.h" +#include "minddata/dataset/include/status.h" #include "minddata/dataset/include/tensor.h" -#include "minddata/dataset/util/status.h" namespace mindspore { namespace tensor { class DETensor : public mindspore::tensor::MSTensor { @@ -79,4 +79,4 @@ class DETensor : public mindspore::tensor::MSTensor { }; } // namespace tensor } // namespace mindspore -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_ +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_ diff --git a/mindspore/ccsrc/minddata/dataset/include/execute.h b/mindspore/ccsrc/minddata/dataset/include/execute.h index 64c1a55c3a..e3d0d1d80a 100644 --- a/mindspore/ccsrc/minddata/dataset/include/execute.h +++ b/mindspore/ccsrc/minddata/dataset/include/execute.h @@ -14,12 +14,13 @@ * limitations under the License. */ -#ifndef DATASET_API_EXECUTE_H_ -#define DATASET_API_EXECUTE_H_ +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_ #include #include -#include "minddata/dataset/core/constants.h" + +#include "minddata/dataset/include/constants.h" #ifdef ENABLE_ANDROID #include "minddata/dataset/include/de_tensor.h" #endif @@ -55,4 +56,4 @@ class Execute { } // namespace dataset } // namespace mindspore -#endif // DATASET_API_EXECUTE_H_ +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/include/iterator.h b/mindspore/ccsrc/minddata/dataset/include/iterator.h index 1686f1552d..f4a07c36cd 100644 --- a/mindspore/ccsrc/minddata/dataset/include/iterator.h +++ b/mindspore/ccsrc/minddata/dataset/include/iterator.h @@ -21,7 +21,6 @@ #include #include #include -#include "minddata/dataset/engine/runtime_context.h" #include "minddata/dataset/include/status.h" namespace mindspore { @@ -45,10 +44,10 @@ using TensorVec = std::vector>; class Iterator { public: /// \brief Constructor - Iterator() : consumer_(nullptr) {} + Iterator(); /// \brief Destructor - ~Iterator() { Stop(); } + ~Iterator(); /// \brief Method for building and launching the pipeline. /// \param[in] ops - a vector of DatasetOp in the data pipeline. diff --git a/mindspore/ccsrc/minddata/dataset/include/samplers.h b/mindspore/ccsrc/minddata/dataset/include/samplers.h index 99a364da57..1854cb7eed 100644 --- a/mindspore/ccsrc/minddata/dataset/include/samplers.h +++ b/mindspore/ccsrc/minddata/dataset/include/samplers.h @@ -21,10 +21,11 @@ #include #include -#include "minddata/dataset/util/status.h" +#include "minddata/dataset/include/status.h" #ifndef ENABLE_ANDROID #include "minddata/mindrecord/include/shard_column.h" #include "minddata/mindrecord/include/shard_error.h" +#include "minddata/mindrecord/include/shard_operator.h" #include "minddata/mindrecord/include/shard_reader.h" #endif diff --git a/mindspore/ccsrc/minddata/dataset/include/status.h b/mindspore/ccsrc/minddata/dataset/include/status.h index b88f69bbe8..82857e2363 100644 --- a/mindspore/ccsrc/minddata/dataset/include/status.h +++ b/mindspore/ccsrc/minddata/dataset/include/status.h @@ -51,6 +51,13 @@ namespace dataset { } \ } while (false) +#define CHECK_FAIL_RETURN_SYNTAX_ERROR(_condition, _e) \ + do { \ + if (!(_condition)) { \ + return Status(StatusCode::kSyntaxError, __LINE__, __FILE__, _e); \ + } \ + } while (false) + #define RETURN_UNEXPECTED_IF_NULL(_ptr) \ do { \ if ((_ptr) == nullptr) { \ @@ -71,6 +78,15 @@ namespace dataset { return Status(StatusCode::kSyntaxError, __LINE__, __FILE__, _e); \ } while (false) +#define RETURN_SECOND_IF_ERROR(_s, _r) \ + do { \ + Status __rc = (_s); \ + if (__rc.IsError()) { \ + MS_LOG(ERROR) << __rc; \ + return _r; \ + } \ + } while (false) + enum class StatusCode : char { kOK = 0, kOutOfMemory = 1, @@ -151,6 +167,12 @@ class Status { StatusCode code_; std::string err_msg_; }; + +#if !defined(_WIN32) && !defined(_WIN64) +const float MAX_MEMORY_USAGE_THRESHOLD = 0.95; + +float GetMemoryUsage(); +#endif } // namespace dataset } // namespace mindspore #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_STATUS_H_ diff --git a/mindspore/ccsrc/minddata/dataset/include/text.h b/mindspore/ccsrc/minddata/dataset/include/text.h index af13020993..d1e8de8c02 100644 --- a/mindspore/ccsrc/minddata/dataset/include/text.h +++ b/mindspore/ccsrc/minddata/dataset/include/text.h @@ -22,18 +22,16 @@ #include #include -#include "mindspore/ccsrc/minddata/dataset/core/data_type.h" -#include "minddata/dataset/core/constants.h" +#include "minddata/dataset/include/constants.h" +#include "minddata/dataset/include/status.h" #include "minddata/dataset/include/transforms.h" -#include "minddata/dataset/util/status.h" - -#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h" -#include "minddata/dataset/text/sentence_piece_vocab.h" -#include "minddata/dataset/text/vocab.h" namespace mindspore { namespace dataset { +class Vocab; +class SentencePieceVocab; + // Transform operations for text namespace text { @@ -146,10 +144,11 @@ std::shared_ptr JiebaTokenizer(const std::string &hmm_p /// \param[in] vocab a Vocab object. /// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov). /// If unknown_token is oov, runtime error will be thrown. -/// \param[in] DataType type of the tensor after lookup, typically int32. +/// \param[in] data_type type of the tensor after lookup, typically int32. /// \return Shared pointer to the current TensorOperation. + std::shared_ptr Lookup(const std::shared_ptr &vocab, const std::string &unknown_token, - const mindspore::dataset::DataType &data_type = DataType("int32")); + const std::string &data_type = "int32"); /// \brief TensorOp to generate n-gram from a 1-D string Tensor. /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result @@ -226,9 +225,9 @@ std::shared_ptr SlidingWindow(const int32_t width, const /// https://en.cppreference.com/w/cpp/string/basic_string/stof, /// https://en.cppreference.com/w/cpp/string/basic_string/stoul, /// except that any strings which represent negative numbers cannot be cast to an unsigned integer type. -/// \param[in] data_type DataType of the tensor to be casted to. Must be a numeric type. +/// \param[in] data_type of the tensor to be casted to. Must be a numeric type. /// \return Shared pointer to the current TensorOperation. -std::shared_ptr ToNumber(const DataType data_type); +std::shared_ptr ToNumber(const std::string &data_type); /// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length. /// \param[in] max_length Maximum length required. @@ -285,7 +284,7 @@ class BertTokenizerOperation : public TensorOperation { bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets); - ~BertTokenizerOperation() = default; + ~BertTokenizerOperation(); std::shared_ptr Build() override; @@ -342,9 +341,9 @@ class JiebaTokenizerOperation : public TensorOperation { class LookupOperation : public TensorOperation { public: explicit LookupOperation(const std::shared_ptr &vocab, const std::string &unknown_token, - const DataType &data_type); + const std::string &data_type); - ~LookupOperation() = default; + ~LookupOperation(); std::shared_ptr Build() override; @@ -356,7 +355,7 @@ class LookupOperation : public TensorOperation { std::shared_ptr vocab_; std::string unknown_token_; int32_t default_id_; - DataType data_type_; + std::string data_type_; }; class NgramOperation : public TensorOperation { @@ -439,7 +438,7 @@ class SentencePieceTokenizerOperation : public TensorOperation { SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type); - ~SentencePieceTokenizerOperation() = default; + ~SentencePieceTokenizerOperation(); std::shared_ptr Build() override; @@ -473,7 +472,7 @@ class SlidingWindowOperation : public TensorOperation { class ToNumberOperation : public TensorOperation { public: - explicit ToNumberOperation(DataType data_type); + explicit ToNumberOperation(std::string data_type); ~ToNumberOperation() = default; @@ -484,7 +483,7 @@ class ToNumberOperation : public TensorOperation { std::string Name() const override { return kToNumberOperation; } private: - DataType data_type_; + std::string data_type_; }; class TruncateSequencePairOperation : public TensorOperation { diff --git a/mindspore/ccsrc/minddata/dataset/include/type_id.h b/mindspore/ccsrc/minddata/dataset/include/type_id.h index b9ffcc9e8d..d4c0560931 100644 --- a/mindspore/ccsrc/minddata/dataset/include/type_id.h +++ b/mindspore/ccsrc/minddata/dataset/include/type_id.h @@ -16,7 +16,6 @@ #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_ #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_ -#include "minddata/dataset/core/data_type.h" #include "mindspore/core/ir/dtype/type_id.h" namespace mindspore { diff --git a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h index 49fdd038a0..2e78bc8ce6 100644 --- a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h +++ b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h @@ -22,10 +22,11 @@ #include #include #include "minddata/dataset/util/status.h" +#include "minddata/dataset/include/constants.h" namespace mindspore { namespace dataset { -enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 }; + class SentencePieceVocab { public: static Status BuildFromFile(const std::vector &path_list, const int vocab_size, diff --git a/tests/ut/cpp/dataset/build_vocab_test.cc b/tests/ut/cpp/dataset/build_vocab_test.cc index e3d960debe..c50da85c56 100644 --- a/tests/ut/cpp/dataset/build_vocab_test.cc +++ b/tests/ut/cpp/dataset/build_vocab_test.cc @@ -22,6 +22,7 @@ #include "common/common.h" #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/status.h" +#include "minddata/dataset/text/vocab.h" using mindspore::dataset::Tensor; using mindspore::dataset::Status; diff --git a/tests/ut/cpp/dataset/c_api_cache_test.cc b/tests/ut/cpp/dataset/c_api_cache_test.cc index f149d499ce..e0acf16deb 100644 --- a/tests/ut/cpp/dataset/c_api_cache_test.cc +++ b/tests/ut/cpp/dataset/c_api_cache_test.cc @@ -17,8 +17,6 @@ #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/vision.h" -#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h" - using namespace mindspore::dataset; // Helper function to get the session id from SESSION_ID env variable @@ -28,7 +26,6 @@ class MindDataTestCacheOp : public UT::DatasetOpTesting { public: void SetUp() override { DatasetOpTesting::SetUp(); - GlobalInit(); } }; diff --git a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc index 6b74674a2f..ee2fb422ea 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc @@ -15,6 +15,7 @@ */ #include "common/common.h" #include "minddata/dataset/core/tensor_row.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/vision.h" diff --git a/tests/ut/cpp/dataset/c_api_samplers_test.cc b/tests/ut/cpp/dataset/c_api_samplers_test.cc index 3d3dde5434..800e567e5c 100644 --- a/tests/ut/cpp/dataset/c_api_samplers_test.cc +++ b/tests/ut/cpp/dataset/c_api_samplers_test.cc @@ -14,6 +14,7 @@ * limitations under the License. */ #include "common/common.h" +#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" #include "minddata/dataset/include/datasets.h" using namespace mindspore::dataset; diff --git a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc index 884a542785..9a16f39877 100644 --- a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc @@ -18,11 +18,12 @@ #include #include "common/common.h" -#include "minddata/dataset/core/constants.h" +#include "minddata/dataset/include/constants.h" #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/status.h" -#include "minddata/dataset/include/transforms.h" #include "minddata/dataset/include/text.h" +#include "minddata/dataset/include/transforms.h" +#include "minddata/dataset/text/sentence_piece_vocab.h" using namespace mindspore::dataset; using mindspore::dataset::SentencePieceModel; diff --git a/tests/ut/cpp/dataset/c_api_text_test.cc b/tests/ut/cpp/dataset/c_api_text_test.cc index 1ad4e44c12..9df3cca6b3 100644 --- a/tests/ut/cpp/dataset/c_api_text_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_test.cc @@ -21,11 +21,11 @@ #include "minddata/dataset/include/config.h" #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/status.h" -#include "minddata/dataset/include/transforms.h" #include "minddata/dataset/include/text.h" +#include "minddata/dataset/include/transforms.h" +#include "minddata/dataset/text/vocab.h" using namespace mindspore::dataset; -using mindspore::dataset::DataType; using mindspore::dataset::ShuffleMode; using mindspore::dataset::Status; using mindspore::dataset::Tensor; @@ -1011,7 +1011,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess1) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = text::ToNumber(DataType("int64")); + std::shared_ptr to_number = text::ToNumber("int64"); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -1064,7 +1064,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = text::ToNumber(DataType("float64")); + std::shared_ptr to_number = text::ToNumber("float64"); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -1117,7 +1117,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = text::ToNumber(DataType("int8")); + std::shared_ptr to_number = text::ToNumber("int8"); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -1167,7 +1167,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = text::ToNumber(DataType("float16")); + std::shared_ptr to_number = text::ToNumber("float16"); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -1213,7 +1213,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail3) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = text::ToNumber(DataType("int64")); + std::shared_ptr to_number = text::ToNumber("int64"); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -1246,7 +1246,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail3) { TEST_F(MindDataTestPipeline, TestToNumberFail4) { MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4."; - // Test ToNumber with non numerical DataType + // Test ToNumber with non numerical data type std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt"; @@ -1255,15 +1255,15 @@ TEST_F(MindDataTestPipeline, TestToNumberFail4) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number1 = text::ToNumber(DataType("string")); + std::shared_ptr to_number1 = text::ToNumber("string"); - // Expect failure: invalid parameter with non numerical DataType + // Expect failure: invalid parameter with non numerical data type EXPECT_EQ(to_number1, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number2 = text::ToNumber(DataType("bool")); + std::shared_ptr to_number2 = text::ToNumber("bool"); - // Expect failure: invalid parameter with non numerical DataType + // Expect failure: invalid parameter with non numerical data type EXPECT_EQ(to_number2, nullptr); } diff --git a/tests/ut/cpp/dataset/c_api_text_vocab_test.cc b/tests/ut/cpp/dataset/c_api_text_vocab_test.cc index 008330b24a..a01d697153 100644 --- a/tests/ut/cpp/dataset/c_api_text_vocab_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_vocab_test.cc @@ -20,8 +20,9 @@ #include "common/common.h" #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/status.h" -#include "minddata/dataset/include/transforms.h" #include "minddata/dataset/include/text.h" +#include "minddata/dataset/include/transforms.h" +#include "minddata/dataset/text/vocab.h" using namespace mindspore::dataset; using mindspore::dataset::DataType; @@ -49,7 +50,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) { EXPECT_EQ(s, Status::OK()); // Create Lookup operation on ds - std::shared_ptr lookup = text::Lookup(vocab, "", DataType("int32")); + std::shared_ptr lookup = text::Lookup(vocab, "", "int32"); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -93,7 +94,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) { EXPECT_EQ(s, Status::OK()); // Create Lookup operation on ds - std::shared_ptr lookup = text::Lookup(vocab, "", DataType("int32")); + std::shared_ptr lookup = text::Lookup(vocab, "", "int32"); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -137,7 +138,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) { // Create lookup op for ds // Expected failure: "" is not a word of vocab - std::shared_ptr lookup = text::Lookup(vocab, "", DataType("int32")); + std::shared_ptr lookup = text::Lookup(vocab, "", "int32"); EXPECT_EQ(lookup, nullptr); } @@ -148,7 +149,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) { // Create lookup op // Expected failure: vocab is null - std::shared_ptr lookup = text::Lookup(vocab, "", DataType("int32")); + std::shared_ptr lookup = text::Lookup(vocab, "", "int32"); EXPECT_EQ(lookup, nullptr); } @@ -170,7 +171,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) { EXPECT_EQ(home_index, 4); // Create Lookup operation on ds - std::shared_ptr lookup = text::Lookup(vocab, "", DataType("int32")); + std::shared_ptr lookup = text::Lookup(vocab, "", "int32"); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -324,7 +325,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) { EXPECT_EQ(home_index, 2); // Create Lookup operation on ds - std::shared_ptr lookup = text::Lookup(vocab, "home", DataType("int64")); + std::shared_ptr lookup = text::Lookup(vocab, "home", "int64"); EXPECT_NE(lookup, nullptr); // Create Map operation on ds diff --git a/tests/ut/cpp/dataset/ir_callback_test.cc b/tests/ut/cpp/dataset/ir_callback_test.cc index a37110eb05..69359518f6 100644 --- a/tests/ut/cpp/dataset/ir_callback_test.cc +++ b/tests/ut/cpp/dataset/ir_callback_test.cc @@ -21,6 +21,7 @@ #include "minddata/dataset/callback/ds_callback.h" #include "minddata/dataset/core/client.h" #include "minddata/dataset/engine/datasetops/source/random_data_op.h" +#include "minddata/dataset/engine/tree_adapter.h" #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/transforms.h" #include "minddata/dataset/kernels/data/no_op.h"