diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
index d597a72443..f8f74c86c0 100644
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
@@ -55,24 +55,44 @@
 // IR non-leaf nodes
 #include "minddata/dataset/engine/ir/datasetops/batch_node.h"
 #include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
 #include "minddata/dataset/engine/ir/datasetops/project_node.h"
 #include "minddata/dataset/engine/ir/datasetops/rename_node.h"
 #include "minddata/dataset/engine/ir/datasetops/repeat_node.h"
 #include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
 #include "minddata/dataset/engine/ir/datasetops/take_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
 
 #ifndef ENABLE_ANDROID
 #include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
 #include "minddata/dataset/engine/ir/datasetops/build_vocab_node.h"
 #endif
 
-// IR leaf nodes
-#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
-
 #include "minddata/dataset/core/config_manager.h"
 #include "minddata/dataset/util/path.h"
 #include "minddata/dataset/util/random.h"
 
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/album_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/coco_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
+
+// IR leaf nodes disabled for android
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h"
+#endif
+
 namespace mindspore {
 namespace dataset {
 namespace api {
@@ -852,1033 +872,8 @@ Status ValidateDatasetColumnParam(const std::string &dataset_name, const std::st
   return Status::OK();
 }
 
-/* ####################################### Derived Dataset classes ################################# */
-
-// DERIVED DATASET CLASSES LEAF-NODE DATASETS
-// (In alphabetical order)
-
-// Constructor for AlbumNode
-AlbumNode::AlbumNode(const std::string &dataset_dir, const std::string &data_schema,
-                     const std::vector<std::string> &column_names, bool decode,
-                     const std::shared_ptr<SamplerObj> &sampler)
-    : dataset_dir_(dataset_dir),
-      schema_path_(data_schema),
-      column_names_(column_names),
-      decode_(decode),
-      sampler_(sampler) {}
-
-Status AlbumNode::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateDatasetDirParam("AlbumNode", dataset_dir_));
-
-  RETURN_IF_NOT_OK(ValidateDatasetFilesParam("AlbumNode", {schema_path_}));
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("AlbumNode", sampler_));
-
-  if (!column_names_.empty()) {
-    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("AlbumNode", "column_names", column_names_));
-  }
-
-  return Status::OK();
-}
-
-// Function to build AlbumNode
-std::vector<std::shared_ptr<DatasetOp>> AlbumNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  auto schema = std::make_unique<DataSchema>();
-  RETURN_EMPTY_IF_ERROR(schema->LoadSchemaFile(schema_path_, column_names_));
-
-  // Argument that is not exposed to user in the API.
-  std::set<std::string> extensions = {};
-
-  node_ops.push_back(std::make_shared<AlbumOp>(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_,
-                                               decode_, extensions, std::move(schema), std::move(sampler_->Build())));
-  return node_ops;
-}
-
-// Constructor for CelebANode
-CelebANode::CelebANode(const std::string &dataset_dir, const std::string &usage,
-                       const std::shared_ptr<SamplerObj> &sampler, const bool &decode,
-                       const std::set<std::string> &extensions, const std::shared_ptr<DatasetCache> &cache)
-    : Dataset(cache),
-      dataset_dir_(dataset_dir),
-      usage_(usage),
-      sampler_(sampler),
-      decode_(decode),
-      extensions_(extensions) {}
-
-Status CelebANode::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateDatasetDirParam("CelebANode", dataset_dir_));
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("CelebANode", sampler_));
-
-  RETURN_IF_NOT_OK(ValidateStringValue("CelebANode", usage_, {"all", "train", "valid", "test"}));
-
-  return Status::OK();
-}
-
-// Function to build CelebANode
-std::vector<std::shared_ptr<DatasetOp>> CelebANode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
-  RETURN_EMPTY_IF_ERROR(
-    schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
-  // label is like this:0 1 0 0 1......
-  RETURN_EMPTY_IF_ERROR(
-    schema->AddColumn(ColDescriptor("attr", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-
-  node_ops.push_back(std::make_shared<CelebAOp>(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_,
-                                                decode_, usage_, extensions_, std::move(schema),
-                                                std::move(sampler_->Build())));
-
-  return node_ops;
-}
-
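// Illustrative sketch only -- FooNode and FooOp are hypothetical, not part of this
// patch. Every node in the section being removed follows the same three-step
// contract, which is what makes the per-file split later in this change mechanical:
//
//   class FooNode : public Dataset {
//    public:
//     FooNode(const std::string &dataset_dir, const std::shared_ptr<SamplerObj> &sampler)
//         : dataset_dir_(dataset_dir), sampler_(sampler) {}
//
//     // 1. Cheap parameter checks, run before any runtime op is created.
//     Status ValidateParams() override { return ValidateDatasetDirParam("FooNode", dataset_dir_); }
//
//     // 2. Emit the runtime DatasetOps for this node, upper-most op first.
//     std::vector<std::shared_ptr<DatasetOp>> Build() override {
//       std::vector<std::shared_ptr<DatasetOp>> node_ops;
//       // 3. A leaf op owns its schema and a runtime sampler built from the IR-level SamplerObj.
//       node_ops.push_back(std::make_shared<FooOp>(num_workers_, connector_que_size_, std::move(sampler_->Build())));
//       return node_ops;
//     }
//   };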
-// Constructor for Cifar10Node
-Cifar10Node::Cifar10Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr<SamplerObj> sampler,
-                         std::shared_ptr<DatasetCache> cache)
-    : Dataset(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}
-
-Status Cifar10Node::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateDatasetDirParam("Cifar10Node", dataset_dir_));
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("Cifar10Node", sampler_));
-
-  RETURN_IF_NOT_OK(ValidateStringValue("Cifar10Node", usage_, {"train", "test", "all"}));
-
-  return Status::OK();
-}
-
-// Function to build CifarOp for Cifar10
-std::vector<std::shared_ptr<DatasetOp>> Cifar10Node::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  // Do internal Schema generation.
-  auto schema = std::make_unique<DataSchema>();
-  RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
-  TensorShape scalar = TensorShape::CreateScalar();
-  RETURN_EMPTY_IF_ERROR(
-    schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
-
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(std::make_shared<CifarOp>(CifarOp::CifarType::kCifar10, usage_, num_workers_, rows_per_buffer_,
-                                               dataset_dir_, connector_que_size_, std::move(schema),
-                                               std::move(sampler_->Build())));
-
-  return node_ops;
-}
-
-// Constructor for Cifar100Node
-Cifar100Node::Cifar100Node(const std::string &dataset_dir, const std::string &usage,
-                           std::shared_ptr<SamplerObj> sampler, std::shared_ptr<DatasetCache> cache)
-    : Dataset(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}
-
-Status Cifar100Node::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateDatasetDirParam("Cifar100Node", dataset_dir_));
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("Cifar100Node", sampler_));
-
-  RETURN_IF_NOT_OK(ValidateStringValue("Cifar100Node", usage_, {"train", "test", "all"}));
-
-  return Status::OK();
-}
-
-// Function to build CifarOp for Cifar100
-std::vector<std::shared_ptr<DatasetOp>> Cifar100Node::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  // Do internal Schema generation.
-  auto schema = std::make_unique<DataSchema>();
-  RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
-  TensorShape scalar = TensorShape::CreateScalar();
-  RETURN_EMPTY_IF_ERROR(
-    schema->AddColumn(ColDescriptor("coarse_label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
-  RETURN_EMPTY_IF_ERROR(
-    schema->AddColumn(ColDescriptor("fine_label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
-
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(std::make_shared<CifarOp>(CifarOp::CifarType::kCifar100, usage_, num_workers_, rows_per_buffer_,
-                                               dataset_dir_, connector_que_size_, std::move(schema),
-                                               std::move(sampler_->Build())));
-
-  return node_ops;
-}
-
-// Constructor for CLUENode
-CLUENode::CLUENode(const std::vector<std::string> clue_files, std::string task, std::string usage, int64_t num_samples,
-                   ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, std::shared_ptr<DatasetCache> cache)
-    : Dataset(std::move(cache)),
-      dataset_files_(clue_files),
-      task_(task),
-      usage_(usage),
-      num_samples_(num_samples),
-      shuffle_(shuffle),
-      num_shards_(num_shards),
-      shard_id_(shard_id) {}
-
-Status CLUENode::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateDatasetFilesParam("CLUENode", dataset_files_));
-
-  RETURN_IF_NOT_OK(ValidateStringValue("CLUENode", task_, {"AFQMC", "TNEWS", "IFLYTEK", "CMNLI", "WSC", "CSL"}));
-
-  RETURN_IF_NOT_OK(ValidateStringValue("CLUENode", usage_, {"train", "test", "eval"}));
-
-  if (num_samples_ < 0) {
-    std::string err_msg = "CLUENode: Invalid number of samples: " + std::to_string(num_samples_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  RETURN_IF_NOT_OK(ValidateDatasetShardParams("CLUENode", num_shards_, shard_id_));
-
-  return Status::OK();
-}
-
-// Function to split string based on a character delimiter
-std::vector<std::string> CLUENode::split(const std::string &s, char delim) {
-  std::vector<std::string> res;
-  std::stringstream ss(s);
-  std::string item;
-
-  while (getline(ss, item, delim)) {
-    res.push_back(item);
-  }
-  return res;
-}
-
-// Function to build CLUENode
-std::vector<std::shared_ptr<DatasetOp>> CLUENode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  std::map<std::string, std::string> key_map;
-  if (task_ == "AFQMC") {
-    if (usage_ == "train") {
-      key_map["sentence1"] = "sentence1";
-      key_map["sentence2"] = "sentence2";
-      key_map["label"] = "label";
-    } else if (usage_ == "test") {
-      key_map["id"] = "id";
-      key_map["sentence1"] = "sentence1";
-      key_map["sentence2"] = "sentence2";
-    } else if (usage_ == "eval") {
-      key_map["sentence1"] = "sentence1";
-      key_map["sentence2"] = "sentence2";
-      key_map["label"] = "label";
-    }
-  } else if (task_ == "CMNLI") {
-    if (usage_ == "train") {
-      key_map["sentence1"] = "sentence1";
-      key_map["sentence2"] = "sentence2";
-      key_map["label"] = "label";
-    } else if (usage_ == "test") {
-      key_map["id"] = "id";
-      key_map["sentence1"] = "sentence1";
-      key_map["sentence2"] = "sentence2";
-    } else if (usage_ == "eval") {
-      key_map["sentence1"] = "sentence1";
-      key_map["sentence2"] = "sentence2";
-      key_map["label"] = "label";
-    }
-  } else if (task_ == "CSL") {
-    if (usage_ == "train") {
-      key_map["id"] = "id";
-      key_map["abst"] = "abst";
-      key_map["keyword"] = "keyword";
-      key_map["label"] = "label";
-    } else if (usage_ == "test") {
-      key_map["id"] = "id";
-      key_map["abst"] = "abst";
-      key_map["keyword"] = "keyword";
-    } else if (usage_ == "eval") {
-      key_map["id"] = "id";
-      key_map["abst"] = "abst";
-      key_map["keyword"] = "keyword";
-      key_map["label"] = "label";
-    }
-  } else if (task_ == "IFLYTEK") {
-    if (usage_ == "train") {
-      key_map["label"] = "label";
-      key_map["label_des"] = "label_des";
-      key_map["sentence"] = "sentence";
-    } else if (usage_ == "test") {
-      key_map["id"] = "id";
-      key_map["sentence"] = "sentence";
-    } else if (usage_ == "eval") {
-      key_map["label"] = "label";
-      key_map["label_des"] = "label_des";
-      key_map["sentence"] = "sentence";
-    }
-  } else if (task_ == "TNEWS") {
-    if (usage_ == "train") {
-      key_map["label"] = "label";
-      key_map["label_desc"] = "label_desc";
-      key_map["sentence"] = "sentence";
-      key_map["keywords"] = "keywords";
-    } else if (usage_ == "test") {
-      key_map["id"] = "id";
-      key_map["sentence"] = "sentence";
-      key_map["keywords"] = "keywords";
-    } else if (usage_ == "eval") {
-      key_map["label"] = "label";
-      key_map["label_desc"] = "label_desc";
-      key_map["sentence"] = "sentence";
-      key_map["keywords"] = "keywords";
-    }
-  } else if (task_ == "WSC") {
-    if (usage_ == "train") {
-      key_map["span1_index"] = "target/span1_index";
-      key_map["span2_index"] = "target/span2_index";
-      key_map["span1_text"] = "target/span1_text";
-      key_map["span2_text"] = "target/span2_text";
-      key_map["idx"] = "idx";
-      key_map["label"] = "label";
-      key_map["text"] = "text";
-    } else if (usage_ == "test") {
-      key_map["span1_index"] = "target/span1_index";
-      key_map["span2_index"] = "target/span2_index";
-      key_map["span1_text"] = "target/span1_text";
-      key_map["span2_text"] = "target/span2_text";
-      key_map["idx"] = "idx";
-      key_map["text"] = "text";
-    } else if (usage_ == "eval") {
-      key_map["span1_index"] = "target/span1_index";
-      key_map["span2_index"] = "target/span2_index";
-      key_map["span1_text"] = "target/span1_text";
-      key_map["span2_text"] = "target/span2_text";
-      key_map["idx"] = "idx";
-      key_map["label"] = "label";
-      key_map["text"] = "text";
-    }
-  }
-
-  ColKeyMap ck_map;
-  for (auto &p : key_map) {
-    ck_map.insert({p.first, split(p.second, '/')});
-  }
-
-  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
-
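// Worked example of the mapping above (values taken from the WSC branch): nested
// JSON fields are addressed with '/'-separated paths, and split() turns each path
// into the key vector that ClueOp walks at parse time:
//
//   split("target/span1_index", '/')  ==>  {"target", "span1_index"}
//
// so ck_map lets ClueOp read {"target": {"span1_index": ...}} out of a JSON row
// and surface it as the flat output column "span1_index".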
-  // Sort the dataset files in a lexicographical order
-  std::vector<std::string> sorted_dataset_files = dataset_files_;
-  std::sort(sorted_dataset_files.begin(), sorted_dataset_files.end());
-
-  std::shared_ptr<ClueOp> clue_op =
-    std::make_shared<ClueOp>(num_workers_, rows_per_buffer_, num_samples_, worker_connector_size_, ck_map,
-                             sorted_dataset_files, connector_que_size_, shuffle_files, num_shards_, shard_id_, nullptr);
-  RETURN_EMPTY_IF_ERROR(clue_op->Init());
-  if (shuffle_ == ShuffleMode::kGlobal) {
-    // Inject ShuffleOp
-    std::shared_ptr<DatasetOp> shuffle_op = nullptr;
-    int64_t num_rows = 0;
-
-    // First, get the number of rows in the dataset
-    RETURN_EMPTY_IF_ERROR(ClueOp::CountAllFileRows(sorted_dataset_files, &num_rows));
-
-    // Add the shuffle op after this op
-    RETURN_EMPTY_IF_ERROR(AddShuffleOp(sorted_dataset_files.size(), num_shards_, num_rows, 0, connector_que_size_,
-                                       rows_per_buffer_, &shuffle_op));
-    node_ops.push_back(shuffle_op);
-  }
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(clue_op);
-
-  return node_ops;
-}
-
-// Constructor for CocoNode
-CocoNode::CocoNode(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task,
-                   const bool &decode, const std::shared_ptr<SamplerObj> &sampler, std::shared_ptr<DatasetCache> cache)
-    : Dataset(std::move(cache)),
-      dataset_dir_(dataset_dir),
-      annotation_file_(annotation_file),
-      task_(task),
-      decode_(decode),
-      sampler_(sampler) {}
-
-Status CocoNode::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateDatasetDirParam("CocoNode", dataset_dir_));
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("CocoNode", sampler_));
-
-  Path annotation_file(annotation_file_);
-  if (!annotation_file.Exists()) {
-    std::string err_msg = "CocoNode: annotation_file is invalid or does not exist.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  RETURN_IF_NOT_OK(ValidateStringValue("CocoNode", task_, {"Detection", "Stuff", "Panoptic", "Keypoint"}));
-
-  return Status::OK();
-}
-
-// Function to build CocoNode
-std::vector<std::shared_ptr<DatasetOp>> CocoNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  CocoOp::TaskType task_type;
-  if (task_ == "Detection") {
-    task_type = CocoOp::TaskType::Detection;
-  } else if (task_ == "Stuff") {
-    task_type = CocoOp::TaskType::Stuff;
-  } else if (task_ == "Keypoint") {
-    task_type = CocoOp::TaskType::Keypoint;
-  } else if (task_ == "Panoptic") {
-    task_type = CocoOp::TaskType::Panoptic;
-  }
-
-  std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
-  RETURN_EMPTY_IF_ERROR(
-    schema->AddColumn(ColDescriptor(std::string("image"), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
-  switch (task_type) {
-    case CocoOp::TaskType::Detection:
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("bbox"), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1)));
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("category_id"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("iscrowd"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-      break;
-    case CocoOp::TaskType::Stuff:
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("segmentation"), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1)));
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("iscrowd"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-      break;
-    case CocoOp::TaskType::Keypoint:
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("keypoints"), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1)));
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("num_keypoints"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-      break;
-    case CocoOp::TaskType::Panoptic:
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("bbox"), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1)));
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("category_id"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-      RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-        ColDescriptor(std::string("iscrowd"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-      RETURN_EMPTY_IF_ERROR(
-        schema->AddColumn(ColDescriptor(std::string("area"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-      break;
-    default:
-      MS_LOG(ERROR) << "CocoNode::Build : Invalid task type";
-      return {};
-  }
-  std::shared_ptr<CocoOp> op =
-    std::make_shared<CocoOp>(task_type, dataset_dir_, annotation_file_, num_workers_, rows_per_buffer_,
-                             connector_que_size_, decode_, std::move(schema), std::move(sampler_->Build()));
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(op);
-
-  return node_ops;
-}
-
-// Constructor for CSVNode
-CSVNode::CSVNode(const std::vector<std::string> &csv_files, char field_delim,
-                 const std::vector<std::shared_ptr<CsvBase>> &column_defaults,
-                 const std::vector<std::string> &column_names, int64_t num_samples, ShuffleMode shuffle,
-                 int32_t num_shards, int32_t shard_id, std::shared_ptr<DatasetCache> cache)
-    : Dataset(std::move(cache)),
-      dataset_files_(csv_files),
-      field_delim_(field_delim),
-      column_defaults_(column_defaults),
-      column_names_(column_names),
-      num_samples_(num_samples),
-      shuffle_(shuffle),
-      num_shards_(num_shards),
-      shard_id_(shard_id) {}
-
-Status CSVNode::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateDatasetFilesParam("CSVNode", dataset_files_));
-
-  if (field_delim_ == '"' || field_delim_ == '\r' || field_delim_ == '\n') {
-    std::string err_msg = "CSVNode: The field delimiter should not be \", \\r, \\n";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (num_samples_ < 0) {
-    std::string err_msg = "CSVNode: Invalid number of samples: " + std::to_string(num_samples_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  RETURN_IF_NOT_OK(ValidateDatasetShardParams("CSVNode", num_shards_, shard_id_));
-
-  if (find(column_defaults_.begin(), column_defaults_.end(), nullptr) != column_defaults_.end()) {
-    std::string err_msg = "CSVNode: column_default should not be null.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (!column_names_.empty()) {
-    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("CSVNode", "column_names", column_names_));
-  }
-
-  return Status::OK();
-}
-
-// Function to build CSVNode
-std::vector<std::shared_ptr<DatasetOp>> CSVNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
-
-  // Sort the dataset files in a lexicographical order
-  std::vector<std::string> sorted_dataset_files = dataset_files_;
-  std::sort(sorted_dataset_files.begin(), sorted_dataset_files.end());
-
-  std::vector<std::shared_ptr<CsvOp::BaseRecord>> column_default_list;
-  for (auto v : column_defaults_) {
-    if (v->type == CsvType::INT) {
-      column_default_list.push_back(
-        std::make_shared<CsvOp::Record<int>>(CsvOp::INT, std::dynamic_pointer_cast<CsvRecord<int>>(v)->value));
-    } else if (v->type == CsvType::FLOAT) {
-      column_default_list.push_back(
-        std::make_shared<CsvOp::Record<float>>(CsvOp::FLOAT, std::dynamic_pointer_cast<CsvRecord<float>>(v)->value));
-    } else if (v->type == CsvType::STRING) {
-      column_default_list.push_back(std::make_shared<CsvOp::Record<std::string>>(
-        CsvOp::STRING, std::dynamic_pointer_cast<CsvRecord<std::string>>(v)->value));
-    }
-  }
-
-  std::shared_ptr<CsvOp> csv_op = std::make_shared<CsvOp>(
-    sorted_dataset_files, field_delim_, column_default_list, column_names_, num_workers_, rows_per_buffer_,
-    num_samples_, worker_connector_size_, connector_que_size_, shuffle_files, num_shards_, shard_id_, nullptr);
-  RETURN_EMPTY_IF_ERROR(csv_op->Init());
-  if (shuffle_ == ShuffleMode::kGlobal) {
-    // Inject ShuffleOp
-    std::shared_ptr<DatasetOp> shuffle_op = nullptr;
-    int64_t num_rows = 0;
-
-    // First, get the number of rows in the dataset
-    RETURN_EMPTY_IF_ERROR(CsvOp::CountAllFileRows(sorted_dataset_files, column_names_.empty(), &num_rows));
-
-    // Add the shuffle op after this op
-    RETURN_EMPTY_IF_ERROR(AddShuffleOp(sorted_dataset_files.size(), num_shards_, num_rows, 0, connector_que_size_,
-                                       rows_per_buffer_, &shuffle_op));
-
-    node_ops.push_back(shuffle_op);
-  }
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(csv_op);
-
-  return node_ops;
-}
-#ifndef ENABLE_ANDROID
-ManifestNode::ManifestNode(const std::string &dataset_file, const std::string &usage,
-                           const std::shared_ptr<SamplerObj> &sampler,
-                           const std::map<std::string, int32_t> &class_indexing, bool decode,
-                           std::shared_ptr<DatasetCache> cache)
-    : Dataset(std::move(cache)),
-      dataset_file_(dataset_file),
-      usage_(usage),
-      decode_(decode),
-      class_index_(class_indexing),
-      sampler_(sampler) {}
-
-Status ManifestNode::ValidateParams() {
-  std::vector<char> forbidden_symbols = {':', '*', '?', '"', '<', '>', '|', '`', '&', '\'', ';'};
-  for (char c : dataset_file_) {
-    auto p = std::find(forbidden_symbols.begin(), forbidden_symbols.end(), c);
-    if (p != forbidden_symbols.end()) {
-      std::string err_msg = "ManifestNode: filename should not contain :*?\"<>|`&;\'";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  }
-
-  Path manifest_file(dataset_file_);
-  if (!manifest_file.Exists()) {
-    std::string err_msg = "ManifestNode: dataset file: [" + dataset_file_ + "] is invalid or not exist";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("ManifestNode", sampler_));
-
-  RETURN_IF_NOT_OK(ValidateStringValue("ManifestNode", usage_, {"train", "eval", "inference"}));
-
-  return Status::OK();
-}
-
-std::vector<std::shared_ptr<DatasetOp>> ManifestNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  // Do internal Schema generation.
-  auto schema = std::make_unique<DataSchema>();
-  RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
-  TensorShape scalar = TensorShape::CreateScalar();
-  RETURN_EMPTY_IF_ERROR(
-    schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
-
-  std::shared_ptr<ManifestOp> manifest_op;
-  manifest_op =
-    std::make_shared<ManifestOp>(num_workers_, rows_per_buffer_, dataset_file_, connector_que_size_, decode_,
-                                 class_index_, std::move(schema), std::move(sampler_->Build()), usage_);
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(manifest_op);
-
-  return node_ops;
-}
-#endif
-
-#ifndef ENABLE_ANDROID
-MindDataNode::MindDataNode(const std::vector<std::string> &dataset_files, const std::vector<std::string> &columns_list,
-                           const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded)
-    : dataset_file_(std::string()),
-      dataset_files_(dataset_files),
-      search_for_pattern_(false),
-      columns_list_(columns_list),
-      sampler_(sampler),
-      padded_sample_(padded_sample),
-      sample_bytes_({}),
-      num_padded_(num_padded) {}
-
-MindDataNode::MindDataNode(const std::string &dataset_file, const std::vector<std::string> &columns_list,
-                           const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded)
-    : dataset_file_(dataset_file),
-      dataset_files_({}),
-      search_for_pattern_(true),
-      columns_list_(columns_list),
-      sampler_(sampler),
-      padded_sample_(padded_sample),
-      sample_bytes_({}),
-      num_padded_(num_padded) {}
-
-Status MindDataNode::ValidateParams() {
-  if (!search_for_pattern_ && dataset_files_.size() > 4096) {
-    std::string err_msg =
-      "MindDataNode: length of dataset_file must be less than or equal to 4096, dataset_file length: " +
-      std::to_string(dataset_file_.size());
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  std::vector<std::string> dataset_file_vec =
-    search_for_pattern_ ? std::vector<std::string>{dataset_file_} : dataset_files_;
-  RETURN_IF_NOT_OK(ValidateDatasetFilesParam("MindDataNode", dataset_file_vec));
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("MindDataNode", sampler_));
-
-  if (!columns_list_.empty()) {
-    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("MindDataNode", "columns_list", columns_list_));
-  }
-
-  if (padded_sample_ != nullptr) {
-    if (num_padded_ < 0) {
-      std::string err_msg =
-        "MindDataNode: num_padded must be greater than or equal to zero, num_padded: " + std::to_string(num_padded_);
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-    if (columns_list_.empty()) {
-      std::string err_msg = "MindDataNode: padded_sample is specified and requires columns_list as well";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-    for (std::string &column : columns_list_) {
-      if (padded_sample_.find(column) == padded_sample_.end()) {
-        std::string err_msg = "MindDataNode: " + column + " in columns_list does not match any column in padded_sample";
-        MS_LOG(ERROR) << err_msg << ", padded_sample: " << padded_sample_;
-        RETURN_STATUS_SYNTAX_ERROR(err_msg);
-      }
-    }
-  }
-  if (num_padded_ > 0) {
-    if (padded_sample_ == nullptr) {
-      std::string err_msg = "MindDataNode: num_padded is specified but padded_sample is not";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  }
-
-  return Status::OK();
-}
-
-// Helper function to create runtime sampler for minddata dataset
-Status MindDataNode::BuildMindDatasetSamplerChain(const std::shared_ptr<SamplerObj> &sampler,
-                                                  std::vector<std::shared_ptr<ShardOperator>> *operators_,
-                                                  int64_t num_padded) {
-  std::shared_ptr<ShardOperator> op = sampler->BuildForMindDataset();
-  if (op == nullptr) {
-    std::string err_msg =
-      "MindDataNode: Unsupported sampler is supplied for MindDataset. Supported sampler list: "
-      "SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler and DistributedSampler";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  std::stack<std::shared_ptr<ShardOperator>> stack_ops;
-  while (op != nullptr) {
-    auto sampler_op = std::dynamic_pointer_cast<mindrecord::ShardDistributedSample>(op);
-    if (sampler_op && num_padded > 0) {
-      sampler_op->SetNumPaddedSamples(num_padded);
-      stack_ops.push(sampler_op);
-    } else {
-      stack_ops.push(op);
-    }
-    op = op->GetChildOp();
-  }
-  while (!stack_ops.empty()) {
-    operators_->push_back(stack_ops.top());
-    stack_ops.pop();
-  }
-  return Status::OK();
-}
-
-// Helper function to set sample_bytes from py::byte type
-void MindDataNode::SetSampleBytes(std::map<std::string, std::string> *sample_bytes) { sample_bytes_ = *sample_bytes; }
-
-std::vector<std::shared_ptr<DatasetOp>> MindDataNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  std::vector<std::shared_ptr<ShardOperator>> operators_;
-  RETURN_EMPTY_IF_ERROR(BuildMindDatasetSamplerChain(sampler_, &operators_, num_padded_));
-
-  std::shared_ptr<MindRecordOp> mindrecord_op;
-  // If pass a string to MindData(), it will be treated as a pattern to search for matched files,
-  // else if pass a vector to MindData(), it will be treated as specified files to be read
-  if (search_for_pattern_) {
-    std::vector<std::string> dataset_file_vec_ = {dataset_file_};
-    mindrecord_op = std::make_shared<MindRecordOp>(num_workers_, rows_per_buffer_, dataset_file_vec_,
-                                                   search_for_pattern_, connector_que_size_, columns_list_, operators_,
-                                                   num_padded_, padded_sample_, sample_bytes_);
-  } else {
-    mindrecord_op = std::make_shared<MindRecordOp>(num_workers_, rows_per_buffer_, dataset_files_, search_for_pattern_,
-                                                   connector_que_size_, columns_list_, operators_, num_padded_,
-                                                   padded_sample_, sample_bytes_);
-  }
-
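// Illustration of the dispatch above (the file names are hypothetical):
//
//   MindData("/path/imagenet.mindrecord0", ...)                 // search_for_pattern_ == true:
//                                                               // the string is a pattern; all
//                                                               // matching shard files are loaded
//   MindData({"/path/a.mindrecord", "/path/b.mindrecord"}, ...) // search_for_pattern_ == false:
//                                                               // exactly the listed files are read
//
// The operators_ vector handed to MindRecordOp is the sampler chain assembled by
// BuildMindDatasetSamplerChain() above: the chain is walked parent-to-child, pushed
// onto a stack, then popped, so the innermost (child) sampler ends up first in the
// vector, ahead of the samplers that wrap it.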
-  RETURN_EMPTY_IF_ERROR(mindrecord_op->Init());
-  node_ops.push_back(mindrecord_op);
-
-  return node_ops;
-}
-#endif
-
-MnistNode::MnistNode(std::string dataset_dir, std::string usage, std::shared_ptr<SamplerObj> sampler,
-                     std::shared_ptr<DatasetCache> cache)
-    : Dataset(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}
-
-Status MnistNode::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateDatasetDirParam("MnistNode", dataset_dir_));
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("MnistNode", sampler_));
-
-  RETURN_IF_NOT_OK(ValidateStringValue("MnistNode", usage_, {"train", "test", "all"}));
-
-  return Status::OK();
-}
-
-std::vector<std::shared_ptr<DatasetOp>> MnistNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  // Do internal Schema generation.
-  auto schema = std::make_unique<DataSchema>();
-  RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
-  TensorShape scalar = TensorShape::CreateScalar();
-  RETURN_EMPTY_IF_ERROR(
-    schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(std::make_shared<MnistOp>(usage_, num_workers_, rows_per_buffer_, dataset_dir_,
-                                               connector_que_size_, std::move(schema), std::move(sampler_->Build())));
-
-  return node_ops;
-}
-
-// ValideParams for RandomNode
-Status RandomNode::ValidateParams() {
-  if (total_rows_ < 0) {
-    std::string err_msg =
-      "RandomNode: total_rows must be greater than or equal 0, now get " + std::to_string(total_rows_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("RandomNode", sampler_));
-
-  if (!columns_list_.empty()) {
-    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("RandomNode", "columns_list", columns_list_));
-  }
-
-  return Status::OK();
-}
-
-int32_t RandomNode::GenRandomInt(int32_t min, int32_t max) {
-  std::uniform_int_distribution<int32_t> uniDist(min, max);
-  return uniDist(rand_gen_);
-}
-
-// Build for RandomNode
-std::vector<std::shared_ptr<DatasetOp>> RandomNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  rand_gen_.seed(GetSeed());  // seed the random generator
-  // If total rows was not given, then randomly pick a number
-  std::shared_ptr<SchemaObj> schema_obj;
-  if (!schema_path_.empty()) {
-    schema_obj = Schema(schema_path_);
-    if (schema_obj == nullptr) {
-      return {};
-    }
-  }
-
-  std::string schema_json_string, schema_file_path;
-  if (schema_ != nullptr) {
-    schema_->set_dataset_type("Random");
-    if (total_rows_ != 0) {
-      schema_->set_num_rows(total_rows_);
-    }
-    schema_json_string = schema_->to_json();
-  } else {
-    schema_file_path = schema_path_;
-  }
-
-  std::unique_ptr<DataSchema> data_schema;
-  std::vector<std::string> columns_to_load;
-  if (columns_list_.size() > 0) {
-    columns_to_load = columns_list_;
-  }
-  if (!schema_file_path.empty() || !schema_json_string.empty()) {
-    data_schema = std::make_unique<DataSchema>();
-    if (!schema_file_path.empty()) {
-      data_schema->LoadSchemaFile(schema_file_path, columns_to_load);
-    } else if (!schema_json_string.empty()) {
-      data_schema->LoadSchemaString(schema_json_string, columns_to_load);
-    }
-  }
-  std::shared_ptr<RandomDataOp> op;
-  op = std::make_shared<RandomDataOp>(num_workers_, connector_que_size_, rows_per_buffer_, total_rows_,
-                                      std::move(data_schema), std::move(sampler_->Build()));
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(op);
-
-  return node_ops;
-}
-
-// Constructor for TextFileNode
-TextFileNode::TextFileNode(std::vector<std::string> dataset_files, int32_t num_samples, ShuffleMode shuffle,
-                           int32_t num_shards, int32_t shard_id, std::shared_ptr<DatasetCache> cache)
-    : Dataset(std::move(cache)),
-      dataset_files_(dataset_files),
-      num_samples_(num_samples),
-      shuffle_(shuffle),
-      num_shards_(num_shards),
-      shard_id_(shard_id) {}
-
-Status TextFileNode::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateDatasetFilesParam("TextFileNode", dataset_files_));
-
-  if (num_samples_ < 0) {
-    std::string err_msg = "TextFileNode: Invalid number of samples: " + std::to_string(num_samples_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  RETURN_IF_NOT_OK(ValidateDatasetShardParams("TextFileNode", num_shards_, shard_id_));
-
-  return Status::OK();
-}
-
-// Function to build TextFileNode
-std::vector<std::shared_ptr<DatasetOp>> TextFileNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
-
-  // Sort the dataset files in a lexicographical order
-  std::vector<std::string> sorted_dataset_files = dataset_files_;
-  std::sort(sorted_dataset_files.begin(), sorted_dataset_files.end());
-
-  // Do internal Schema generation.
-  auto schema = std::make_unique<DataSchema>();
-  RETURN_EMPTY_IF_ERROR(
-    schema->AddColumn(ColDescriptor("text", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
-
-  // Create and initalize TextFileOp
-  std::shared_ptr<TextFileOp> text_file_op = std::make_shared<TextFileOp>(
-    num_workers_, rows_per_buffer_, num_samples_, worker_connector_size_, std::move(schema), sorted_dataset_files,
-    connector_que_size_, shuffle_files, num_shards_, shard_id_, nullptr);
-  RETURN_EMPTY_IF_ERROR(text_file_op->Init());
-
-  if (shuffle_ == ShuffleMode::kGlobal) {
-    // Inject ShuffleOp
-    std::shared_ptr<DatasetOp> shuffle_op = nullptr;
-    int64_t num_rows = 0;
-
-    // First, get the number of rows in the dataset
-    RETURN_EMPTY_IF_ERROR(TextFileOp::CountAllFileRows(sorted_dataset_files, &num_rows));
-
-    // Add the shuffle op after this op
-    RETURN_EMPTY_IF_ERROR(AddShuffleOp(sorted_dataset_files.size(), num_shards_, num_rows, 0, connector_que_size_,
-                                       rows_per_buffer_, &shuffle_op));
-    node_ops.push_back(shuffle_op);
-  }
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  // Add TextFileOp
-  node_ops.push_back(text_file_op);
-
-  return node_ops;
-}
-
 #ifndef ENABLE_ANDROID
-// Validator for TFRecordNode
-Status TFRecordNode::ValidateParams() { return Status::OK(); }
-
-// Function to build TFRecordNode
-std::vector<std::shared_ptr<DatasetOp>> TFRecordNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  // Sort the datasets file in a lexicographical order
-  std::vector<std::string> sorted_dir_files = dataset_files_;
-  std::sort(sorted_dir_files.begin(), sorted_dir_files.end());
-
-  // Create Schema Object
-  std::unique_ptr<DataSchema> data_schema = std::make_unique<DataSchema>();
-  if (!schema_path_.empty()) {
-    RETURN_EMPTY_IF_ERROR(data_schema->LoadSchemaFile(schema_path_, columns_list_));
-  } else if (schema_obj_ != nullptr) {
-    std::string schema_json_string = schema_obj_->to_json();
-    RETURN_EMPTY_IF_ERROR(data_schema->LoadSchemaString(schema_json_string, columns_list_));
-  }
-
-  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
-
-  // Create and initialize TFReaderOp
-  std::shared_ptr<TFReaderOp> tf_reader_op = std::make_shared<TFReaderOp>(
-    num_workers_, worker_connector_size_, rows_per_buffer_, num_samples_, sorted_dir_files, std::move(data_schema),
-    connector_que_size_, columns_list_, shuffle_files, num_shards_, shard_id_, shard_equal_rows_, nullptr);
-
-  RETURN_EMPTY_IF_ERROR(tf_reader_op->Init());
-
-  if (shuffle_ == ShuffleMode::kGlobal) {
-    // Inject ShuffleOp
-
-    std::shared_ptr<DatasetOp> shuffle_op = nullptr;
-    int64_t num_rows = 0;
-
-    // First, get the number of rows in the dataset
-    RETURN_EMPTY_IF_ERROR(TFReaderOp::CountTotalRows(&num_rows, sorted_dir_files));
-
-    // Add the shuffle op after this op
-    RETURN_EMPTY_IF_ERROR(AddShuffleOp(sorted_dir_files.size(), num_shards_, num_rows, 0, connector_que_size_,
-                                       rows_per_buffer_, &shuffle_op));
-    node_ops.push_back(shuffle_op);
-  }
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  // Add TFReaderOp
-  node_ops.push_back(tf_reader_op);
-  return node_ops;
-}
-// Constructor for VOCNode
-VOCNode::VOCNode(const std::string &dataset_dir, const std::string &task, const std::string &usage,
-                 const std::map<std::string, int32_t> &class_indexing, bool decode, std::shared_ptr<SamplerObj> sampler,
-                 std::shared_ptr<DatasetCache> cache)
-    : Dataset(std::move(cache)),
-      dataset_dir_(dataset_dir),
-      task_(task),
-      usage_(usage),
-      class_index_(class_indexing),
-      decode_(decode),
-      sampler_(sampler) {}
-
-Status VOCNode::ValidateParams() {
-  Path dir(dataset_dir_);
-
-  RETURN_IF_NOT_OK(ValidateDatasetDirParam("VOCNode", dataset_dir_));
-
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("VOCNode", sampler_));
-
-  if (task_ == "Segmentation") {
-    if (!class_index_.empty()) {
-      std::string err_msg = "VOCNode: class_indexing is invalid in Segmentation task.";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-    Path imagesets_file = dir / "ImageSets" / "Segmentation" / usage_ + ".txt";
-    if (!imagesets_file.Exists()) {
-      std::string err_msg = "VOCNode: Invalid usage: " + usage_ + ", file does not exist";
-      MS_LOG(ERROR) << "VOCNode: Invalid usage: " << usage_ << ", file \"" << imagesets_file << "\" does not exist!";
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  } else if (task_ == "Detection") {
-    Path imagesets_file = dir / "ImageSets" / "Main" / usage_ + ".txt";
-    if (!imagesets_file.Exists()) {
-      std::string err_msg = "VOCNode: Invalid usage: " + usage_ + ", file does not exist";
-      MS_LOG(ERROR) << "VOCNode: Invalid usage: " << usage_ << ", file \"" << imagesets_file << "\" does not exist!";
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  } else {
-    std::string err_msg = "VOCNode: Invalid task: " + task_;
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-// Function to build VOCNode
-std::vector<std::shared_ptr<DatasetOp>> VOCNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  auto schema = std::make_unique<DataSchema>();
-  VOCOp::TaskType task_type_;
-
-  if (task_ == "Segmentation") {
-    task_type_ = VOCOp::TaskType::Segmentation;
-    RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-      ColDescriptor(std::string(kColumnImage), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
-    RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-      ColDescriptor(std::string(kColumnTarget), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
-  } else if (task_ == "Detection") {
-    task_type_ = VOCOp::TaskType::Detection;
-    RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-      ColDescriptor(std::string(kColumnImage), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
-    RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-      ColDescriptor(std::string(kColumnBbox), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1)));
-    RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-      ColDescriptor(std::string(kColumnLabel), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-    RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-      ColDescriptor(std::string(kColumnDifficult), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-    RETURN_EMPTY_IF_ERROR(schema->AddColumn(
-      ColDescriptor(std::string(kColumnTruncate), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
-  }
-
-  std::shared_ptr<VOCOp> voc_op;
-  voc_op = std::make_shared<VOCOp>(task_type_, usage_, dataset_dir_, class_index_, num_workers_, rows_per_buffer_,
-                                   connector_que_size_, decode_, std::move(schema), std::move(sampler_->Build()));
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(voc_op);
-  return node_ops;
-}
 
 std::shared_ptr<DatasetCache> CreateDatasetCache(session_id_type id, uint64_t mem_sz, bool spill,
                                                  std::optional<std::string> hostname, std::optional<int32_t> port,
                                                  std::optional<int32_t> num_connections,
@@ -1888,124 +883,6 @@
 }
 #endif
 
-#ifndef ENABLE_ANDROID
-
-#endif
-
-MapNode::MapNode(std::shared_ptr<Dataset> child, std::vector<std::shared_ptr<TensorOperation>> operations,
-                 std::vector<std::string> input_columns, std::vector<std::string> output_columns,
-                 const std::vector<std::string> &project_columns, std::shared_ptr<DatasetCache> cache)
-    : operations_(operations),
-      input_columns_(input_columns),
-      output_columns_(output_columns),
-      project_columns_(project_columns),
-      Dataset(std::move(cache)) {
-  this->children.push_back(child);
-}
-
-std::vector<std::shared_ptr<DatasetOp>> MapNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
-
-  // Build tensorOp from tensorOperation vector
-  // This is to ensure each iterator hold its own copy of the tensorOp objects.
-  (void)std::transform(
-    operations_.begin(), operations_.end(), std::back_inserter(tensor_ops),
-    [](std::shared_ptr<TensorOperation> operation) -> std::shared_ptr<TensorOp> { return operation->Build(); });
-
-  // This parameter will be removed with next rebase
-  std::vector<std::string> col_orders;
-  auto map_op = std::make_shared<MapOp>(input_columns_, output_columns_, tensor_ops, num_workers_, connector_que_size_);
-  if (!project_columns_.empty()) {
-    auto project_op = std::make_shared<ProjectOp>(project_columns_);
-    node_ops.push_back(project_op);
-  }
-  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
-
-  node_ops.push_back(map_op);
-  return node_ops;
-}
-
-Status MapNode::ValidateParams() {
-  if (operations_.empty()) {
-    std::string err_msg = "MapNode: No operation is specified.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (!input_columns_.empty()) {
-    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("MapNode", "input_columns", input_columns_));
-  }
-
-  if (!output_columns_.empty()) {
-    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("MapNode", "output_columns", output_columns_));
-  }
-
-  if (!project_columns_.empty()) {
-    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("MapNode", "project_columns", project_columns_));
-  }
-
-  return Status::OK();
-}
-
-// Constructor for SkipNode
-SkipNode::SkipNode(std::shared_ptr<Dataset> child, int32_t count) : skip_count_(count) {
-  this->children.push_back(child);
-}
-
-// Function to build the SkipOp
-std::vector<std::shared_ptr<DatasetOp>> SkipNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  node_ops.push_back(std::make_shared<SkipOp>(skip_count_, connector_que_size_));
-  return node_ops;
-}
-
-// Function to validate the parameters for SkipNode
-Status SkipNode::ValidateParams() {
-  if (skip_count_ <= -1) {
-    std::string err_msg = "SkipNode: skip_count should not be negative, skip_count: " + std::to_string(skip_count_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-// Function to build ZipOp
-ZipNode::ZipNode(const std::vector<std::shared_ptr<Dataset>> &datasets) : datasets_(datasets) {
-  for (auto dataset : datasets_) {
-    this->children.push_back(dataset);
-  }
-}
-
-Status ZipNode::ValidateParams() {
-  if (datasets_.empty()) {
-    std::string err_msg = "ZipNode: datasets to zip are not specified.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (find(datasets_.begin(), datasets_.end(), nullptr) != datasets_.end()) {
-    std::string err_msg = "ZipNode: zip datasets should not be null.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::vector<std::shared_ptr<DatasetOp>> ZipNode::Build() {
-  // A vector containing shared pointer to the Dataset Ops that this object will create
-  std::vector<std::shared_ptr<DatasetOp>> node_ops;
-
-  node_ops.push_back(std::make_shared<ZipOp>(rows_per_buffer_, connector_que_size_));
-  return node_ops;
-}
-
 }  // namespace api
 }  // namespace dataset
 }  // namespace mindspore
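The datasets.cc hunk above is the removal half of a move: each deleted class reappears below, largely unchanged, in a file of its own under engine/ir/datasetops/ (non-leaf nodes) or engine/ir/datasetops/source/ (leaf nodes). Callers keep reaching the nodes through the factory helpers; a hedged sketch of that path, where ds1 and ds2 are placeholder datasets and the signatures reflect this generation of the C++ API rather than anything added by this patch:

// ZipNode's route from user code to runtime op: Zip() wraps the inputs in a
// ZipNode, ValidateParams() rejects an empty list or a null entry, and Build()
// finally emits the ZipOp.
std::shared_ptr<Dataset> zipped = Zip({ds1, ds2});
std::shared_ptr<Iterator> iter = zipped->CreateIterator();  // validation + Build() run here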
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
index a323d27877..8ffabb7260 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt
index 5450e41fc2..446b2195ed 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt
@@ -4,19 +4,17 @@ add_subdirectory(source)
 set(DATASET_ENGINE_IR_DATASETOPS_SRC_FILES
     batch_node.cc
+    bucket_batch_by_length_node.cc
+    build_vocab_node.cc
     concat_node.cc
+    map_node.cc
     project_node.cc
     rename_node.cc
     repeat_node.cc
     shuffle_node.cc
+    skip_node.cc
     take_node.cc
+    zip_node.cc
     )
 
-if (NOT ENABLE_ANDROID)
-  set(DATASET_ENGINE_IR_DATASETOPS_SRC_FILES
-    ${DATASET_ENGINE_IR_DATASETOPS_SRC_FILES}
-    bucket_batch_by_length_node.cc
-    build_vocab_node.cc)
-endif ()
-
 add_library(engine-ir-datasetops OBJECT ${DATASET_ENGINE_IR_DATASETOPS_SRC_FILES})
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h
index 9274703752..64f861721e 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_BUCKET_BATCH_BY_LENGTH_NODE_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_BUCKET_BATCH_BY_LENGTH_NODE_H_
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUCKET_BATCH_BY_LENGTH_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUCKET_BATCH_BY_LENGTH_NODE_H_
 
 #include <map>
 #include <memory>
@@ -61,4 +61,4 @@ class BucketBatchByLengthNode : public Dataset {
 }  // namespace api
 }  // namespace dataset
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_BUCKET_BATCH_BY_LENGTH_NODE_H_
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUCKET_BATCH_BY_LENGTH_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.h
index 67686c0062..a7a20c3897 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_BUILD_VOCAB_NODE_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_BUILD_VOCAB_NODE_H_
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_VOCAB_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_VOCAB_NODE_H_
 
 #include <memory>
 #include <string>
@@ -58,4 +58,4 @@ class BuildVocabNode : public Dataset {
 }  // namespace api
 }  // namespace dataset
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_BUILD_VOCAB_NODE_H_
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_VOCAB_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.h
index e4b3e38db8..f0b9fcae94 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CONCAT_NODE_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CONCAT_NODE_H_
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_CONCAT_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_CONCAT_NODE_H_
 
 #include <memory>
 #include <vector>
@@ -50,4 +50,4 @@ class ConcatNode : public Dataset {
 }  // namespace api
 }  // namespace dataset
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CONCAT_NODE_H_
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_CONCAT_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc
new file mode 100644
index 0000000000..cd27a48936
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc
@@ -0,0 +1,91 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/engine/datasetops/map_op/map_op.h"
+#include "minddata/dataset/include/transforms.h"
+#include "minddata/dataset/util/status.h"
+namespace mindspore {
+namespace dataset {
+namespace api {
+
+MapNode::MapNode(std::shared_ptr<Dataset> child, std::vector<std::shared_ptr<TensorOperation>> operations,
+                 std::vector<std::string> input_columns, std::vector<std::string> output_columns,
+                 const std::vector<std::string> &project_columns, std::shared_ptr<DatasetCache> cache)
+    : operations_(operations),
+      input_columns_(input_columns),
+      output_columns_(output_columns),
+      project_columns_(project_columns),
+      Dataset(std::move(cache)) {
+  this->children.push_back(child);
+}
+
+std::vector<std::shared_ptr<DatasetOp>> MapNode::Build() {
+  // A vector containing shared pointers to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
+
+  // Build tensorOp from tensorOperation vector
+  // This is to ensure each iterator holds its own copy of the tensorOp objects.
+  (void)std::transform(
+    operations_.begin(), operations_.end(), std::back_inserter(tensor_ops),
+    [](std::shared_ptr<TensorOperation> operation) -> std::shared_ptr<TensorOp> { return operation->Build(); });
+
+  // This parameter will be removed with next rebase
+  std::vector<std::string> col_orders;
+  auto map_op = std::make_shared<MapOp>(input_columns_, output_columns_, tensor_ops, num_workers_, connector_que_size_);
+  if (!project_columns_.empty()) {
+    auto project_op = std::make_shared<ProjectOp>(project_columns_);
+    node_ops.push_back(project_op);
+  }
+  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
+
+  node_ops.push_back(map_op);
+  return node_ops;
+}
+
+Status MapNode::ValidateParams() {
+  if (operations_.empty()) {
+    std::string err_msg = "MapNode: No operation is specified.";
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  if (!input_columns_.empty()) {
+    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("MapNode", "input_columns", input_columns_));
+  }
+
+  if (!output_columns_.empty()) {
+    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("MapNode", "output_columns", output_columns_));
+  }
+
+  if (!project_columns_.empty()) {
+    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("MapNode", "project_columns", project_columns_));
+  }
+
+  return Status::OK();
+}
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
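One subtlety in MapNode::Build() above: node_ops is consumed top-down, so pushing the optional ProjectOp before the MapOp is what places the projection above the map in the executed tree (the same reason the injected ShuffleOp and cache ops elsewhere in this change are pushed before the ops they wrap). A sketch of the resulting chain, illustrative rather than code from this patch:

// MapNode with project_columns = {"image"} builds, in order:
//   node_ops[0] = ProjectOp({"image"})   // becomes the parent op
//   node_ops[1] = MapOp(...)             // its child; rows flow MapOp -> ProjectOp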
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_MAP_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_MAP_NODE_H_ + +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { +class MapNode : public Dataset { + public: + /// \brief Constructor + MapNode(std::shared_ptr child, std::vector> operations, + std::vector input_columns = {}, std::vector output_columns = {}, + const std::vector &columns = {}, std::shared_ptr cache = nullptr); + + /// \brief Destructor + ~MapNode() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return The list of shared pointers to the newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + std::vector> operations_; + std::vector input_columns_; + std::vector output_columns_; + std::vector project_columns_; +}; + +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_MAP_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h index bd163a57bc..5faf88a986 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_PROJECT_NODE_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_PROJECT_NODE_H_ +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_PROJECT_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_PROJECT_NODE_H_ #include #include @@ -51,4 +51,4 @@ class ProjectNode : public Dataset { } // namespace api } // namespace dataset } // namespace mindspore -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_PROJECT_NODE_H_ +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_PROJECT_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h index 2f905835a5..11d8975056 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_RENAME_NODE_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_RENAME_NODE_H_ +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_RENAME_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_RENAME_NODE_H_ #include #include @@ -53,4 +53,4 @@ class RenameNode : public Dataset { } // namespace api } // namespace dataset } // namespace mindspore -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_RENAME_NODE_H_ +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_RENAME_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h index b363fd66f7..d893da637a 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_REPEAT_NODE_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_REPEAT_NODE_H_ +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_REPEAT_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_REPEAT_NODE_H_ #include #include @@ -53,4 +53,4 @@ class RepeatNode : public Dataset { } // namespace api } // namespace dataset } // namespace mindspore -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_REPEAT_NODE_H_ +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_REPEAT_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h index 748c2f3ba4..07a1503f24 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_SHUFFLE_NODE_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_SHUFFLE_NODE_H_ +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SHUFFLE_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SHUFFLE_NODE_H_ #include #include @@ -49,4 +49,4 @@ class ShuffleNode : public Dataset { } // namespace api } // namespace dataset } // namespace mindspore -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_SHUFFLE_NODE_H_ +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SHUFFLE_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc new file mode 100644 index 0000000000..5fa20dae1f --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc @@ -0,0 +1,57 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/engine/datasetops/skip_op.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+namespace api {
+
+// Constructor for SkipNode
+SkipNode::SkipNode(std::shared_ptr<Dataset> child, int32_t count) : skip_count_(count) {
+  this->children.push_back(child);
+}
+
+// Function to build the SkipOp
+std::vector<std::shared_ptr<DatasetOp>> SkipNode::Build() {
+  // A vector containing shared pointer to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  node_ops.push_back(std::make_shared<SkipOp>(skip_count_, connector_que_size_));
+  return node_ops;
+}
+
+// Function to validate the parameters for SkipNode
+Status SkipNode::ValidateParams() {
+  if (skip_count_ <= -1) {
+    std::string err_msg = "SkipNode: skip_count should not be negative, skip_count: " + std::to_string(skip_count_);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
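[Reviewer aside] The IR nodes in this patch are not meant to be constructed by hand; the fluent Dataset API in include/datasets.h builds them. A minimal sketch of the path that reaches the new SkipNode, assuming the existing api::ImageFolder factory, api::RandomSampler, and the Dataset::Skip wrapper (paths and counts here are illustrative):

#include "minddata/dataset/include/datasets.h"
using mindspore::dataset::api::Dataset;
using mindspore::dataset::api::ImageFolder;
using mindspore::dataset::api::RandomSampler;

std::shared_ptr<Dataset> ds = ImageFolder("/path/to/images", true, RandomSampler(false, 10));
ds = ds->Skip(2);  // wraps the pipeline in a SkipNode with skip_count = 2
// SkipNode::ValidateParams() rejects negative counts; Build() then emits the runtime SkipOp.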
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h
new file mode 100644
index 0000000000..81c6d26311
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SKIP_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SKIP_NODE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/include/datasets.h"
+
+namespace mindspore {
+namespace dataset {
+
+namespace api {
+class SkipNode : public Dataset {
+ public:
+  /// \brief Constructor
+  explicit SkipNode(std::shared_ptr<Dataset> child, int32_t count);
+
+  /// \brief Destructor
+  ~SkipNode() = default;
+
+  /// \brief a base class override function to create the required runtime dataset op objects for this class
+  /// \return The list of shared pointers to the newly created DatasetOps
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return Status Status::OK() if all the parameters are valid
+  Status ValidateParams() override;
+
+ private:
+  int32_t skip_count_;
+};
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SKIP_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt
index 244d6586d0..92bcef432c 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt
@@ -2,7 +2,21 @@ file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc"
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
 
 set(DATASET_ENGINE_IR_DATASETOPS_SOURCE_SRC_FILES
+        album_node.cc
+        celeba_node.cc
+        cifar100_node.cc
+        cifar10_node.cc
+        clue_node.cc
+        coco_node.cc
+        csv_node.cc
         image_folder_node.cc
+        manifest_node.cc
+        minddata_node.cc
+        mnist_node.cc
+        random_node.cc
+        text_file_node.cc
+        tf_record_node.cc
+        voc_node.cc
         )
 
 if (ENABLE_PYTHON)
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc
new file mode 100644
index 0000000000..51588eae5c
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc
@@ -0,0 +1,73 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "minddata/dataset/engine/ir/datasetops/source/album_node.h" + +#include +#include +#include +#include +#include + +#include "minddata/dataset/engine/datasetops/source/album_op.h" + +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { +// Constructor for AlbumNode +AlbumNode::AlbumNode(const std::string &dataset_dir, const std::string &data_schema, + const std::vector &column_names, bool decode, + const std::shared_ptr &sampler) + : dataset_dir_(dataset_dir), + schema_path_(data_schema), + column_names_(column_names), + decode_(decode), + sampler_(sampler) {} + +Status AlbumNode::ValidateParams() { + RETURN_IF_NOT_OK(ValidateDatasetDirParam("AlbumNode", dataset_dir_)); + + RETURN_IF_NOT_OK(ValidateDatasetFilesParam("AlbumNode", {schema_path_})); + + RETURN_IF_NOT_OK(ValidateDatasetSampler("AlbumNode", sampler_)); + + if (!column_names_.empty()) { + RETURN_IF_NOT_OK(ValidateDatasetColumnParam("AlbumNode", "column_names", column_names_)); + } + + return Status::OK(); +} + +// Function to build AlbumNode +std::vector> AlbumNode::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + auto schema = std::make_unique(); + RETURN_EMPTY_IF_ERROR(schema->LoadSchemaFile(schema_path_, column_names_)); + + // Argument that is not exposed to user in the API. + std::set extensions = {}; + + node_ops.push_back(std::make_shared(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_, + decode_, extensions, std::move(schema), std::move(sampler_->Build()))); + return node_ops; +} + +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.h new file mode 100644 index 0000000000..8cfeb9b9ad --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.h @@ -0,0 +1,58 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_ALBUM_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_ALBUM_NODE_H_ + +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { + +class AlbumNode : public Dataset { + public: + /// \brief Constructor + AlbumNode(const std::string &dataset_dir, const std::string &data_schema, + const std::vector &column_names, bool decode, const std::shared_ptr &sampler); + + /// \brief Destructor + ~AlbumNode() = default; + + /// \brief a base class override function to create a runtime dataset op object from this class + /// \return shared pointer to the newly created DatasetOp + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + std::string dataset_dir_; + std::string schema_path_; + std::vector column_names_; + bool decode_; + std::shared_ptr sampler_; +}; + +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_ALBUM_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc new file mode 100644 index 0000000000..e46fb580ea --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc @@ -0,0 +1,72 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h" + +#include +#include +#include +#include +#include + +#include "minddata/dataset/engine/datasetops/source/celeba_op.h" +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { +// Constructor for CelebANode +CelebANode::CelebANode(const std::string &dataset_dir, const std::string &usage, + const std::shared_ptr &sampler, const bool &decode, + const std::set &extensions, const std::shared_ptr &cache) + : Dataset(cache), + dataset_dir_(dataset_dir), + usage_(usage), + sampler_(sampler), + decode_(decode), + extensions_(extensions) {} + +Status CelebANode::ValidateParams() { + RETURN_IF_NOT_OK(ValidateDatasetDirParam("CelebANode", dataset_dir_)); + + RETURN_IF_NOT_OK(ValidateDatasetSampler("CelebANode", sampler_)); + + RETURN_IF_NOT_OK(ValidateStringValue("CelebANode", usage_, {"all", "train", "valid", "test"})); + + return Status::OK(); +} + +// Function to build CelebANode +std::vector> CelebANode::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + std::unique_ptr schema = std::make_unique(); + RETURN_EMPTY_IF_ERROR( + schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); + // label is like this:0 1 0 0 1...... + RETURN_EMPTY_IF_ERROR( + schema->AddColumn(ColDescriptor("attr", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + + node_ops.push_back(std::make_shared(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_, + decode_, usage_, extensions_, std::move(schema), + std::move(sampler_->Build()))); + + return node_ops; +} + +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h new file mode 100644 index 0000000000..30a539cbec --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h @@ -0,0 +1,60 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CELEBA_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CELEBA_NODE_H_ + +#include +#include +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { + +class CelebANode : public Dataset { + public: + /// \brief Constructor + CelebANode(const std::string &dataset_dir, const std::string &usage, const std::shared_ptr &sampler, + const bool &decode, const std::set &extensions, const std::shared_ptr &cache); + + /// \brief Destructor + ~CelebANode() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return shared pointer to the list of newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + std::string dataset_dir_; + std::string usage_; + bool decode_; + std::set extensions_; + std::shared_ptr sampler_; +}; + +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CELEBA_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc new file mode 100644 index 0000000000..465cdcdc02 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc @@ -0,0 +1,71 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h" + +#include +#include +#include +#include + +#include "minddata/dataset/engine/datasetops/source/cifar_op.h" + +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { + +// Constructor for Cifar100Node +Cifar100Node::Cifar100Node(const std::string &dataset_dir, const std::string &usage, + std::shared_ptr sampler, std::shared_ptr cache) + : Dataset(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {} + +Status Cifar100Node::ValidateParams() { + RETURN_IF_NOT_OK(ValidateDatasetDirParam("Cifar100Node", dataset_dir_)); + + RETURN_IF_NOT_OK(ValidateDatasetSampler("Cifar100Node", sampler_)); + + RETURN_IF_NOT_OK(ValidateStringValue("Cifar100Node", usage_, {"train", "test", "all"})); + + return Status::OK(); +} + +// Function to build CifarOp for Cifar100 +std::vector> Cifar100Node::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + // Do internal Schema generation. 
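+  // Reviewer note (comment only, not generated code): leaf nodes without an on-disk
+  // schema file declare their output columns programmatically. For Cifar100 the schema
+  // built below is, in effect:
+  //   "image"        -> DE_UINT8, rank-1 (TensorImpl::kCv) raw image tensor
+  //   "coarse_label" -> DE_UINT32, scalar
+  //   "fine_label"   -> DE_UINT32, scalar
+  // Rank 0 plus an explicit TensorShape::CreateScalar() is the MindData idiom for
+  // "one number per row" columns.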
+ auto schema = std::make_unique(); + RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); + TensorShape scalar = TensorShape::CreateScalar(); + RETURN_EMPTY_IF_ERROR( + schema->AddColumn(ColDescriptor("coarse_label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar))); + RETURN_EMPTY_IF_ERROR( + schema->AddColumn(ColDescriptor("fine_label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar))); + + RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops)); + + node_ops.push_back(std::make_shared(CifarOp::CifarType::kCifar100, usage_, num_workers_, rows_per_buffer_, + dataset_dir_, connector_que_size_, std::move(schema), + std::move(sampler_->Build()))); + + return node_ops; +} + +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h new file mode 100644 index 0000000000..ab336b1cd3 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h @@ -0,0 +1,56 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CIFAR100_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CIFAR100_NODE_H_ + +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { + +class Cifar100Node : public Dataset { + public: + /// \brief Constructor + Cifar100Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr sampler, + std::shared_ptr cache); + + /// \brief Destructor + ~Cifar100Node() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return The list of shared pointers to the newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + std::string dataset_dir_; + std::string usage_; + std::shared_ptr sampler_; +}; + +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CIFAR100_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc new file mode 100644 index 0000000000..208a101e4e --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc @@ -0,0 +1,69 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h" + +#include +#include +#include +#include + +#include "minddata/dataset/engine/datasetops/source/cifar_op.h" + +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { + +// Constructor for Cifar10Node +Cifar10Node::Cifar10Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr sampler, + std::shared_ptr cache) + : Dataset(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {} + +Status Cifar10Node::ValidateParams() { + RETURN_IF_NOT_OK(ValidateDatasetDirParam("Cifar10Node", dataset_dir_)); + + RETURN_IF_NOT_OK(ValidateDatasetSampler("Cifar10Node", sampler_)); + + RETURN_IF_NOT_OK(ValidateStringValue("Cifar10Node", usage_, {"train", "test", "all"})); + + return Status::OK(); +} + +// Function to build CifarOp for Cifar10 +std::vector> Cifar10Node::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + // Do internal Schema generation. + auto schema = std::make_unique(); + RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); + TensorShape scalar = TensorShape::CreateScalar(); + RETURN_EMPTY_IF_ERROR( + schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar))); + + RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops)); + + node_ops.push_back(std::make_shared(CifarOp::CifarType::kCifar10, usage_, num_workers_, rows_per_buffer_, + dataset_dir_, connector_que_size_, std::move(schema), + std::move(sampler_->Build()))); + + return node_ops; +} + +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h new file mode 100644 index 0000000000..5832c1446f --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h @@ -0,0 +1,56 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CIFAR10_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CIFAR10_NODE_H_ + +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { + +class Cifar10Node : public Dataset { + public: + /// \brief Constructor + Cifar10Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr sampler, + std::shared_ptr cache); + + /// \brief Destructor + ~Cifar10Node() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return The list of shared pointers to the newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + std::string dataset_dir_; + std::string usage_; + std::shared_ptr sampler_; +}; + +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CIFAR10_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc new file mode 100644 index 0000000000..c6cbb59544 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc @@ -0,0 +1,218 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h"
+
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/engine/datasetops/source/clue_op.h"
+
+#include "minddata/dataset/util/status.h"
+namespace mindspore {
+namespace dataset {
+namespace api {
+
+// Constructor for CLUENode
+CLUENode::CLUENode(const std::vector<std::string> clue_files, std::string task, std::string usage,
+                   int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id,
+                   std::shared_ptr<DatasetCache> cache)
+    : Dataset(std::move(cache)),
+      dataset_files_(clue_files),
+      task_(task),
+      usage_(usage),
+      num_samples_(num_samples),
+      shuffle_(shuffle),
+      num_shards_(num_shards),
+      shard_id_(shard_id) {}
+
+Status CLUENode::ValidateParams() {
+  RETURN_IF_NOT_OK(ValidateDatasetFilesParam("CLUENode", dataset_files_));
+
+  RETURN_IF_NOT_OK(ValidateStringValue("CLUENode", task_, {"AFQMC", "TNEWS", "IFLYTEK", "CMNLI", "WSC", "CSL"}));
+
+  RETURN_IF_NOT_OK(ValidateStringValue("CLUENode", usage_, {"train", "test", "eval"}));
+
+  if (num_samples_ < 0) {
+    std::string err_msg = "CLUENode: Invalid number of samples: " + std::to_string(num_samples_);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  RETURN_IF_NOT_OK(ValidateDatasetShardParams("CLUENode", num_shards_, shard_id_));
+
+  return Status::OK();
+}
+
+// Function to split string based on a character delimiter
+std::vector<std::string> CLUENode::split(const std::string &s, char delim) {
+  std::vector<std::string> res;
+  std::stringstream ss(s);
+  std::string item;
+
+  while (getline(ss, item, delim)) {
+    res.push_back(item);
+  }
+  return res;
+}
+
+// Function to build CLUENode
+std::vector<std::shared_ptr<DatasetOp>> CLUENode::Build() {
+  // A vector containing shared pointer to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  std::map<std::string, std::string> key_map;
+  if (task_ == "AFQMC") {
+    if (usage_ == "train") {
+      key_map["sentence1"] = "sentence1";
+      key_map["sentence2"] = "sentence2";
+      key_map["label"] = "label";
+    } else if (usage_ == "test") {
+      key_map["id"] = "id";
+      key_map["sentence1"] = "sentence1";
+      key_map["sentence2"] = "sentence2";
+    } else if (usage_ == "eval") {
+      key_map["sentence1"] = "sentence1";
+      key_map["sentence2"] = "sentence2";
+      key_map["label"] = "label";
+    }
+  } else if (task_ == "CMNLI") {
+    if (usage_ == "train") {
+      key_map["sentence1"] = "sentence1";
+      key_map["sentence2"] = "sentence2";
+      key_map["label"] = "label";
+    } else if (usage_ == "test") {
+      key_map["id"] = "id";
+      key_map["sentence1"] = "sentence1";
+      key_map["sentence2"] = "sentence2";
+    } else if (usage_ == "eval") {
+      key_map["sentence1"] = "sentence1";
+      key_map["sentence2"] = "sentence2";
+      key_map["label"] = "label";
+    }
+  } else if (task_ == "CSL") {
+    if (usage_ == "train") {
+      key_map["id"] = "id";
+      key_map["abst"] = "abst";
+      key_map["keyword"] = "keyword";
+      key_map["label"] = "label";
+    } else if (usage_ == "test") {
+      key_map["id"] = "id";
+      key_map["abst"] = "abst";
+      key_map["keyword"] = "keyword";
+    } else if (usage_ == "eval") {
+      key_map["id"] = "id";
+      key_map["abst"] = "abst";
+      key_map["keyword"] = "keyword";
+      key_map["label"] = "label";
+    }
+  } else if (task_ == "IFLYTEK") {
+    if (usage_ == "train") {
+      key_map["label"] = "label";
+      key_map["label_des"] = "label_des";
+      key_map["sentence"] = "sentence";
+    } else if (usage_ == "test") {
+      key_map["id"] = "id";
+      key_map["sentence"] = "sentence";
+    } else if (usage_ == "eval") {
+      key_map["label"] = "label";
+      key_map["label_des"] = "label_des";
+      key_map["sentence"] = "sentence";
+    }
+  } else if (task_ == "TNEWS") {
+    if (usage_ == "train") {
+      key_map["label"] = "label";
+      key_map["label_desc"] = "label_desc";
+      key_map["sentence"] = "sentence";
+      key_map["keywords"] = "keywords";
+    } else if (usage_ == "test") {
+      key_map["id"] = "id";
+      key_map["sentence"] = "sentence";
+      key_map["keywords"] = "keywords";
+    } else if (usage_ == "eval") {
+      key_map["label"] = "label";
+      key_map["label_desc"] = "label_desc";
+      key_map["sentence"] = "sentence";
+      key_map["keywords"] = "keywords";
+    }
+  } else if (task_ == "WSC") {
+    if (usage_ == "train") {
+      key_map["span1_index"] = "target/span1_index";
+      key_map["span2_index"] = "target/span2_index";
+      key_map["span1_text"] = "target/span1_text";
+      key_map["span2_text"] = "target/span2_text";
+      key_map["idx"] = "idx";
+      key_map["label"] = "label";
+      key_map["text"] = "text";
+    } else if (usage_ == "test") {
+      key_map["span1_index"] = "target/span1_index";
+      key_map["span2_index"] = "target/span2_index";
+      key_map["span1_text"] = "target/span1_text";
+      key_map["span2_text"] = "target/span2_text";
+      key_map["idx"] = "idx";
+      key_map["text"] = "text";
+    } else if (usage_ == "eval") {
+      key_map["span1_index"] = "target/span1_index";
+      key_map["span2_index"] = "target/span2_index";
+      key_map["span1_text"] = "target/span1_text";
+      key_map["span2_text"] = "target/span2_text";
+      key_map["idx"] = "idx";
+      key_map["label"] = "label";
+      key_map["text"] = "text";
+    }
+  }
+
+  ColKeyMap ck_map;
+  for (auto &p : key_map) {
+    ck_map.insert({p.first, split(p.second, '/')});
+  }
+
+  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
+
+  // Sort the dataset files in a lexicographical order
+  std::vector<std::string> sorted_dataset_files = dataset_files_;
+  std::sort(sorted_dataset_files.begin(), sorted_dataset_files.end());
+
+  std::shared_ptr<ClueOp> clue_op =
+    std::make_shared<ClueOp>(num_workers_, rows_per_buffer_, num_samples_, worker_connector_size_, ck_map,
+                             sorted_dataset_files, connector_que_size_, shuffle_files, num_shards_, shard_id_,
+                             nullptr);
+  RETURN_EMPTY_IF_ERROR(clue_op->Init());
+  if (shuffle_ == ShuffleMode::kGlobal) {
+    // Inject ShuffleOp
+    std::shared_ptr<DatasetOp> shuffle_op = nullptr;
+    int64_t num_rows = 0;
+
+    // First, get the number of rows in the dataset
+    RETURN_EMPTY_IF_ERROR(ClueOp::CountAllFileRows(sorted_dataset_files, &num_rows));
+
+    // Add the shuffle op after this op
+    RETURN_EMPTY_IF_ERROR(AddShuffleOp(sorted_dataset_files.size(), num_shards_, num_rows, 0, connector_que_size_,
+                                       rows_per_buffer_, &shuffle_op));
+    node_ops.push_back(shuffle_op);
+  }
+  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
+
+  node_ops.push_back(clue_op);
+
+  return node_ops;
+}
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
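[Reviewer aside] The value side of key_map above is a path into each JSON line of the CLUE file, with '/' separating nesting levels, so WSC's "target/span1_index" reaches a nested field. A hedged sketch of the lookup ClueOp effectively performs with one ColKeyMap entry (the helper is illustrative; only the '/'-path convention comes from the code above):

#include <nlohmann/json.hpp>
#include <string>
#include <vector>

nlohmann::json LookupColumn(const nlohmann::json &record, const std::vector<std::string> &path) {
  nlohmann::json value = record;
  for (const std::string &key : path) {
    value = value.at(key);  // {"target", "span1_index"} walks record["target"]["span1_index"]
  }
  return value;
}

// split("target/span1_index", '/') produces the two-element path consumed above.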
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h
new file mode 100644
index 0000000000..42cbc1aa6e
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CLUE_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CLUE_NODE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/include/datasets.h"
+
+namespace mindspore {
+namespace dataset {
+namespace api {
+/// \class CLUENode
+/// \brief A Dataset derived class to represent CLUE dataset
+class CLUENode : public Dataset {
+ public:
+  /// \brief Constructor
+  CLUENode(const std::vector<std::string> dataset_files, std::string task, std::string usage, int64_t num_samples,
+           ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, std::shared_ptr<DatasetCache> cache);
+
+  /// \brief Destructor
+  ~CLUENode() = default;
+
+  /// \brief a base class override function to create the required runtime dataset op objects for this class
+  /// \return The list of shared pointers to the newly created DatasetOps
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return Status Status::OK() if all the parameters are valid
+  Status ValidateParams() override;
+
+ private:
+  /// \brief Split string based on a character delimiter
+  /// \return A string vector
+  std::vector<std::string> split(const std::string &s, char delim);
+
+  std::vector<std::string> dataset_files_;
+  std::string task_;
+  std::string usage_;
+  int64_t num_samples_;
+  ShuffleMode shuffle_;
+  int32_t num_shards_;
+  int32_t shard_id_;
+};
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CLUE_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc
new file mode 100644
index 0000000000..adb933bfa4
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc
@@ -0,0 +1,122 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "minddata/dataset/engine/ir/datasetops/source/coco_node.h" + +#include +#include +#include +#include + +#include "minddata/dataset/engine/datasetops/source/coco_op.h" + +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { +// Constructor for CocoNode +CocoNode::CocoNode(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task, + const bool &decode, const std::shared_ptr &sampler, std::shared_ptr cache) + : Dataset(std::move(cache)), + dataset_dir_(dataset_dir), + annotation_file_(annotation_file), + task_(task), + decode_(decode), + sampler_(sampler) {} + +Status CocoNode::ValidateParams() { + RETURN_IF_NOT_OK(ValidateDatasetDirParam("CocoNode", dataset_dir_)); + + RETURN_IF_NOT_OK(ValidateDatasetSampler("CocoNode", sampler_)); + + Path annotation_file(annotation_file_); + if (!annotation_file.Exists()) { + std::string err_msg = "CocoNode: annotation_file is invalid or does not exist."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + RETURN_IF_NOT_OK(ValidateStringValue("CocoNode", task_, {"Detection", "Stuff", "Panoptic", "Keypoint"})); + + return Status::OK(); +} + +// Function to build CocoNode +std::vector> CocoNode::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + CocoOp::TaskType task_type; + if (task_ == "Detection") { + task_type = CocoOp::TaskType::Detection; + } else if (task_ == "Stuff") { + task_type = CocoOp::TaskType::Stuff; + } else if (task_ == "Keypoint") { + task_type = CocoOp::TaskType::Keypoint; + } else if (task_ == "Panoptic") { + task_type = CocoOp::TaskType::Panoptic; + } + + std::unique_ptr schema = std::make_unique(); + RETURN_EMPTY_IF_ERROR( + schema->AddColumn(ColDescriptor(std::string("image"), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); + switch (task_type) { + case CocoOp::TaskType::Detection: + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("bbox"), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("category_id"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("iscrowd"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + break; + case CocoOp::TaskType::Stuff: + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("segmentation"), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("iscrowd"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + break; + case CocoOp::TaskType::Keypoint: + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("keypoints"), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("num_keypoints"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + break; + case CocoOp::TaskType::Panoptic: + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("bbox"), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("category_id"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string("iscrowd"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); 
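+      // Reviewer note (comment only): Panoptic emits bbox/category_id/iscrowd plus the
+      // "area" column added just below. ValidateParams() already restricts task_ to the
+      // four names handled in this switch, so the default branch further down is purely
+      // defensive.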
+ RETURN_EMPTY_IF_ERROR( + schema->AddColumn(ColDescriptor(std::string("area"), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + break; + default: + MS_LOG(ERROR) << "CocoNode::Build : Invalid task type"; + return {}; + } + std::shared_ptr op = + std::make_shared(task_type, dataset_dir_, annotation_file_, num_workers_, rows_per_buffer_, + connector_que_size_, decode_, std::move(schema), std::move(sampler_->Build())); + RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops)); + + node_ops.push_back(op); + + return node_ops; +} +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h new file mode 100644 index 0000000000..3398d33707 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h @@ -0,0 +1,57 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_COCO_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_COCO_NODE_H_ + +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { +class CocoNode : public Dataset { + public: + /// \brief Constructor + CocoNode(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task, + const bool &decode, const std::shared_ptr &sampler, std::shared_ptr cache); + + /// \brief Destructor + ~CocoNode() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return shared pointer to the list of newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + std::string dataset_dir_; + std::string annotation_file_; + std::string task_; + bool decode_; + std::shared_ptr sampler_; +}; + +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_COCO_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc new file mode 100644 index 0000000000..f44688a17a --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc @@ -0,0 +1,127 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/engine/datasetops/source/csv_op.h"
+
+#include "minddata/dataset/util/status.h"
+namespace mindspore {
+namespace dataset {
+namespace api {
+// Constructor for CSVNode
+CSVNode::CSVNode(const std::vector<std::string> &csv_files, char field_delim,
+                 const std::vector<std::shared_ptr<CsvBase>> &column_defaults,
+                 const std::vector<std::string> &column_names, int64_t num_samples, ShuffleMode shuffle,
+                 int32_t num_shards, int32_t shard_id, std::shared_ptr<DatasetCache> cache)
+    : Dataset(std::move(cache)),
+      dataset_files_(csv_files),
+      field_delim_(field_delim),
+      column_defaults_(column_defaults),
+      column_names_(column_names),
+      num_samples_(num_samples),
+      shuffle_(shuffle),
+      num_shards_(num_shards),
+      shard_id_(shard_id) {}
+
+Status CSVNode::ValidateParams() {
+  RETURN_IF_NOT_OK(ValidateDatasetFilesParam("CSVNode", dataset_files_));
+
+  if (field_delim_ == '"' || field_delim_ == '\r' || field_delim_ == '\n') {
+    std::string err_msg = "CSVNode: The field delimiter should not be \", \\r, \\n";
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  if (num_samples_ < 0) {
+    std::string err_msg = "CSVNode: Invalid number of samples: " + std::to_string(num_samples_);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  RETURN_IF_NOT_OK(ValidateDatasetShardParams("CSVNode", num_shards_, shard_id_));
+
+  if (find(column_defaults_.begin(), column_defaults_.end(), nullptr) != column_defaults_.end()) {
+    std::string err_msg = "CSVNode: column_default should not be null.";
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  if (!column_names_.empty()) {
+    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("CSVNode", "column_names", column_names_));
+  }
+
+  return Status::OK();
+}
+
+// Function to build CSVNode
+std::vector<std::shared_ptr<DatasetOp>> CSVNode::Build() {
+  // A vector containing shared pointer to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
+
+  // Sort the dataset files in a lexicographical order
+  std::vector<std::string> sorted_dataset_files = dataset_files_;
+  std::sort(sorted_dataset_files.begin(), sorted_dataset_files.end());
+
+  std::vector<std::shared_ptr<CsvOp::BaseRecord>> column_default_list;
+  for (auto v : column_defaults_) {
+    if (v->type == CsvType::INT) {
+      column_default_list.push_back(
+        std::make_shared<CsvOp::Record<int>>(CsvOp::INT, std::dynamic_pointer_cast<CsvRecord<int>>(v)->value));
+    } else if (v->type == CsvType::FLOAT) {
+      column_default_list.push_back(
+        std::make_shared<CsvOp::Record<float>>(CsvOp::FLOAT, std::dynamic_pointer_cast<CsvRecord<float>>(v)->value));
+    } else if (v->type == CsvType::STRING) {
+      column_default_list.push_back(std::make_shared<CsvOp::Record<std::string>>(
+        CsvOp::STRING, std::dynamic_pointer_cast<CsvRecord<std::string>>(v)->value));
+    }
+  }
+
+  std::shared_ptr<CsvOp> csv_op = std::make_shared<CsvOp>(
+    sorted_dataset_files, field_delim_, column_default_list, column_names_, num_workers_, rows_per_buffer_,
+    num_samples_, worker_connector_size_, connector_que_size_, shuffle_files, num_shards_, shard_id_, nullptr);
+
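+  // Reviewer note (comment only): the block below is the standard MindData pattern for
+  // ShuffleMode::kGlobal on file-based sources. A dataset-wide shuffle cannot happen
+  // inside CsvOp itself, so the node counts all rows first and then injects a dedicated
+  // ShuffleOp above the reader; ShuffleMode::kFiles, by contrast, only randomizes the
+  // file order via the shuffle_files flag already passed to CsvOp.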
+  RETURN_EMPTY_IF_ERROR(csv_op->Init());
+  if (shuffle_ == ShuffleMode::kGlobal) {
+    // Inject ShuffleOp
+    std::shared_ptr<DatasetOp> shuffle_op = nullptr;
+    int64_t num_rows = 0;
+
+    // First, get the number of rows in the dataset
+    RETURN_EMPTY_IF_ERROR(CsvOp::CountAllFileRows(sorted_dataset_files, column_names_.empty(), &num_rows));
+
+    // Add the shuffle op after this op
+    RETURN_EMPTY_IF_ERROR(AddShuffleOp(sorted_dataset_files.size(), num_shards_, num_rows, 0, connector_que_size_,
+                                       rows_per_buffer_, &shuffle_op));
+
+    node_ops.push_back(shuffle_op);
+  }
+  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
+
+  node_ops.push_back(csv_op);
+
+  return node_ops;
+}
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
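[Reviewer aside] CsvBase/CsvRecord, declared in the header below, is how callers give each CSV column a type and a default value; CSVNode::Build() above then converts every entry into the matching CsvOp::Record<T> via dynamic_pointer_cast. A minimal sketch for a three-column file (only the vector is shown; passing it to the api::CSV factory is implied):

std::vector<std::shared_ptr<CsvBase>> column_defaults = {
    std::make_shared<CsvRecord<int>>(CsvType::INT, 0),              // column 1: parsed as int, default 0
    std::make_shared<CsvRecord<float>>(CsvType::FLOAT, 0.0f),       // column 2: parsed as float
    std::make_shared<CsvRecord<std::string>>(CsvType::STRING, ""),  // column 3: kept as string
};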
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h
new file mode 100644
index 0000000000..6e53985b49
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h
@@ -0,0 +1,82 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CSV_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CSV_NODE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/include/datasets.h"
+
+namespace mindspore {
+namespace dataset {
+namespace api {
+
+/// \brief Record type for CSV
+enum CsvType : uint8_t { INT = 0, FLOAT, STRING };
+
+/// \brief Base class of CSV Record
+class CsvBase {
+ public:
+  CsvBase() = default;
+  explicit CsvBase(CsvType t) : type(t) {}
+  virtual ~CsvBase() {}
+  CsvType type;
+};
+
+/// \brief CSV Record that can represent integer, float and string.
+template <typename T>
+class CsvRecord : public CsvBase {
+ public:
+  CsvRecord() = default;
+  CsvRecord(CsvType t, T v) : CsvBase(t), value(v) {}
+  ~CsvRecord() {}
+  T value;
+};
+
+class CSVNode : public Dataset {
+ public:
+  /// \brief Constructor
+  CSVNode(const std::vector<std::string> &dataset_files, char field_delim,
+          const std::vector<std::shared_ptr<CsvBase>> &column_defaults,
+          const std::vector<std::string> &column_names, int64_t num_samples, ShuffleMode shuffle,
+          int32_t num_shards, int32_t shard_id, std::shared_ptr<DatasetCache> cache);
+
+  /// \brief Destructor
+  ~CSVNode() = default;
+
+  /// \brief a base class override function to create the required runtime dataset op objects for this class
+  /// \return shared pointer to the list of newly created DatasetOps
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return Status Status::OK() if all the parameters are valid
+  Status ValidateParams() override;
+
+ private:
+  std::vector<std::string> dataset_files_;
+  char field_delim_;
+  std::vector<std::shared_ptr<CsvBase>> column_defaults_;
+  std::vector<std::string> column_names_;
+  int64_t num_samples_;
+  ShuffleMode shuffle_;
+  int32_t num_shards_;
+  int32_t shard_id_;
+};
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CSV_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc
new file mode 100644
index 0000000000..d884f3fec6
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h" + +#include +#include +#include +#include +#include + +#include "minddata/dataset/engine/datasetops/source/manifest_op.h" + +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { +ManifestNode::ManifestNode(const std::string &dataset_file, const std::string &usage, + const std::shared_ptr &sampler, + const std::map &class_indexing, bool decode, + std::shared_ptr cache) + : Dataset(std::move(cache)), + dataset_file_(dataset_file), + usage_(usage), + decode_(decode), + class_index_(class_indexing), + sampler_(sampler) {} + +Status ManifestNode::ValidateParams() { + std::vector forbidden_symbols = {':', '*', '?', '"', '<', '>', '|', '`', '&', '\'', ';'}; + for (char c : dataset_file_) { + auto p = std::find(forbidden_symbols.begin(), forbidden_symbols.end(), c); + if (p != forbidden_symbols.end()) { + std::string err_msg = "ManifestNode: filename should not contain :*?\"<>|`&;\'"; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + } + + Path manifest_file(dataset_file_); + if (!manifest_file.Exists()) { + std::string err_msg = "ManifestNode: dataset file: [" + dataset_file_ + "] is invalid or not exist"; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + RETURN_IF_NOT_OK(ValidateDatasetSampler("ManifestNode", sampler_)); + + RETURN_IF_NOT_OK(ValidateStringValue("ManifestNode", usage_, {"train", "eval", "inference"})); + + return Status::OK(); +} + +std::vector> ManifestNode::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + // Do internal Schema generation. + auto schema = std::make_unique(); + RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); + TensorShape scalar = TensorShape::CreateScalar(); + RETURN_EMPTY_IF_ERROR( + schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar))); + + std::shared_ptr manifest_op; + manifest_op = + std::make_shared(num_workers_, rows_per_buffer_, dataset_file_, connector_que_size_, decode_, + class_index_, std::move(schema), std::move(sampler_->Build()), usage_); + RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops)); + + node_ops.push_back(manifest_op); + + return node_ops; +} +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h new file mode 100644 index 0000000000..449144fa24 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h @@ -0,0 +1,57 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MANIFEST_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MANIFEST_NODE_H_ + +#include +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { +class ManifestNode : public Dataset { + public: + /// \brief Constructor + ManifestNode(const std::string &dataset_file, const std::string &usage, const std::shared_ptr &sampler, + const std::map &class_indexing, bool decode, std::shared_ptr cache); + + /// \brief Destructor + ~ManifestNode() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return The list of shared pointers to the newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + std::string dataset_file_; + std::string usage_; + bool decode_; + std::map class_index_; + std::shared_ptr sampler_; +}; +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MANIFEST_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.cc new file mode 100644 index 0000000000..4093b2e032 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.cc @@ -0,0 +1,165 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h" + +#include +#include +#include +#include +#include + +#include "minddata/dataset/engine/datasetops/source/mindrecord_op.h" + +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { +MindDataNode::MindDataNode(const std::vector &dataset_files, const std::vector &columns_list, + const std::shared_ptr &sampler, nlohmann::json padded_sample, int64_t num_padded) + : dataset_file_(std::string()), + dataset_files_(dataset_files), + search_for_pattern_(false), + columns_list_(columns_list), + sampler_(sampler), + padded_sample_(padded_sample), + sample_bytes_({}), + num_padded_(num_padded) {} + +MindDataNode::MindDataNode(const std::string &dataset_file, const std::vector &columns_list, + const std::shared_ptr &sampler, nlohmann::json padded_sample, int64_t num_padded) + : dataset_file_(dataset_file), + dataset_files_({}), + search_for_pattern_(true), + columns_list_(columns_list), + sampler_(sampler), + padded_sample_(padded_sample), + sample_bytes_({}), + num_padded_(num_padded) {} + +Status MindDataNode::ValidateParams() { + if (!search_for_pattern_ && dataset_files_.size() > 4096) { + std::string err_msg = + "MindDataNode: length of dataset_file must be less than or equal to 4096, dataset_file length: " + + std::to_string(dataset_file_.size()); + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + std::vector dataset_file_vec = + search_for_pattern_ ? std::vector{dataset_file_} : dataset_files_; + RETURN_IF_NOT_OK(ValidateDatasetFilesParam("MindDataNode", dataset_file_vec)); + + RETURN_IF_NOT_OK(ValidateDatasetSampler("MindDataNode", sampler_)); + + if (!columns_list_.empty()) { + RETURN_IF_NOT_OK(ValidateDatasetColumnParam("MindDataNode", "columns_list", columns_list_)); + } + + if (padded_sample_ != nullptr) { + if (num_padded_ < 0) { + std::string err_msg = + "MindDataNode: num_padded must be greater than or equal to zero, num_padded: " + std::to_string(num_padded_); + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + if (columns_list_.empty()) { + std::string err_msg = "MindDataNode: padded_sample is specified and requires columns_list as well"; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + for (std::string &column : columns_list_) { + if (padded_sample_.find(column) == padded_sample_.end()) { + std::string err_msg = "MindDataNode: " + column + " in columns_list does not match any column in padded_sample"; + MS_LOG(ERROR) << err_msg << ", padded_sample: " << padded_sample_; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + } + } + if (num_padded_ > 0) { + if (padded_sample_ == nullptr) { + std::string err_msg = "MindDataNode: num_padded is specified but padded_sample is not"; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + } + + return Status::OK(); +} + +// Helper function to create runtime sampler for minddata dataset +Status MindDataNode::BuildMindDatasetSamplerChain(const std::shared_ptr &sampler, + std::vector> *operators_, + int64_t num_padded) { + std::shared_ptr op = sampler->BuildForMindDataset(); + if (op == nullptr) { + std::string err_msg = + "MindDataNode: Unsupported sampler is supplied for MindDataset. 
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h
new file mode 100644
index 0000000000..16fed6f5f0
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h
@@ -0,0 +1,75 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MINDDATA_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MINDDATA_NODE_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/include/datasets.h"
+
+namespace mindspore {
+namespace dataset {
+namespace api {
+class MindDataNode : public Dataset {
+ public:
+  /// \brief Constructor
+  MindDataNode(const std::vector<std::string> &dataset_files, const std::vector<std::string> &columns_list,
+               const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded);
+
+  /// \brief Constructor
+  MindDataNode(const std::string &dataset_file, const std::vector<std::string> &columns_list,
+               const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded);
+
+  /// \brief Destructor
+  ~MindDataNode() = default;
+
+  /// \brief a base class override function to create the required runtime dataset op objects for this class
+  /// \return The list of shared pointers to the newly created DatasetOps
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return Status Status::OK() if all the parameters are valid
+  Status ValidateParams() override;
+
+  /// \brief Build sampler chain for minddata dataset
+  /// \return Status Status::OK() if input sampler is valid
+  Status BuildMindDatasetSamplerChain(const std::shared_ptr<SamplerObj> &sampler,
+                                      std::vector<std::shared_ptr<mindrecord::ShardOperator>> *operators_,
+                                      int64_t num_padded);
+
+  /// \brief Set sample_bytes when padded_sample has py::byte value
+  /// \note Pybind will use this function to set sample_bytes into MindDataNode
+  void SetSampleBytes(std::map<std::string, std::string> *sample_bytes);
+
+ private:
+  std::string dataset_file_;                // search_for_pattern_ will be true in this mode
+  std::vector<std::string> dataset_files_;  // search_for_pattern_ will be false in this mode
+  bool search_for_pattern_;
+  std::vector<std::string> columns_list_;
+  std::shared_ptr<SamplerObj> sampler_;
+  nlohmann::json padded_sample_;
+  std::map<std::string, std::string> sample_bytes_;  // enable in python
+  int64_t num_padded_;
+};
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MINDDATA_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc
new file mode 100644
index 0000000000..d5ee2b49a0
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/engine/datasetops/source/mnist_op.h"
+
+#include "minddata/dataset/util/status.h"
+namespace mindspore {
+namespace dataset {
+namespace api {
+
+MnistNode::MnistNode(std::string dataset_dir, std::string usage, std::shared_ptr<SamplerObj> sampler,
+                     std::shared_ptr<DatasetCache> cache)
+    : Dataset(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}
+
+Status MnistNode::ValidateParams() {
+  RETURN_IF_NOT_OK(ValidateDatasetDirParam("MnistNode", dataset_dir_));
+
+  RETURN_IF_NOT_OK(ValidateDatasetSampler("MnistNode", sampler_));
+
+  RETURN_IF_NOT_OK(ValidateStringValue("MnistNode", usage_, {"train", "test", "all"}));
+
+  return Status::OK();
+}
+
+std::vector<std::shared_ptr<DatasetOp>> MnistNode::Build() {
+  // A vector containing shared pointer to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  // Do internal Schema generation.
+  auto schema = std::make_unique<DataSchema>();
+  RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
+  TensorShape scalar = TensorShape::CreateScalar();
+  RETURN_EMPTY_IF_ERROR(
+    schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
+  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
+
+  node_ops.push_back(std::make_shared<MnistOp>(usage_, num_workers_, rows_per_buffer_, dataset_dir_,
+                                               connector_que_size_, std::move(schema), std::move(sampler_->Build())));
+
+  return node_ops;
+}
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
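MnistNode's Build() declares the row layout up front: an "image" column backed by an OpenCV tensor (kCv) and a
scalar "label" column. A hypothetical caller-side sketch (assumptions: SequentialSampler stands for any SamplerObj
factory of this API; the path is illustrative):

  auto node = std::make_shared<MnistNode>("/data/mnist", "train", SequentialSampler(0, 0), nullptr);
  if (node->ValidateParams().IsOk()) {  // directory, sampler and usage are checked before Build()
    auto ops = node->Build();           // yields an optional CacheOp plus a MnistOp wired with the schema above
  }
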
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h
new file mode 100644
index 0000000000..97fb371ce1
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MNIST_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MNIST_NODE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/include/datasets.h"
+
+namespace mindspore {
+namespace dataset {
+namespace api {
+
+class MnistNode : public Dataset {
+ public:
+  /// \brief Constructor
+  MnistNode(std::string dataset_dir, std::string usage, std::shared_ptr<SamplerObj> sampler,
+            std::shared_ptr<DatasetCache> cache);
+
+  /// \brief Destructor
+  ~MnistNode() = default;
+
+  /// \brief a base class override function to create the required runtime dataset op objects for this class
+  /// \return The list of shared pointers to the newly created DatasetOps
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return Status Status::OK() if all the parameters are valid
+  Status ValidateParams() override;
+
+ private:
+  std::string dataset_dir_;
+  std::string usage_;
+  std::shared_ptr<SamplerObj> sampler_;
+};
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MNIST_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc
new file mode 100644
index 0000000000..f02b4c5873
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc
@@ -0,0 +1,104 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/engine/ir/datasetops/source/random_node.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/engine/datasetops/source/random_data_op.h"
+#include "minddata/dataset/util/random.h"
+#include "minddata/dataset/util/status.h"
+namespace mindspore {
+namespace dataset {
+namespace api {
+// ValidateParams for RandomNode
+Status RandomNode::ValidateParams() {
+  if (total_rows_ < 0) {
+    std::string err_msg =
+      "RandomNode: total_rows must be greater than or equal to 0, now get " + std::to_string(total_rows_);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  RETURN_IF_NOT_OK(ValidateDatasetSampler("RandomNode", sampler_));
+
+  if (!columns_list_.empty()) {
+    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("RandomNode", "columns_list", columns_list_));
+  }
+
+  return Status::OK();
+}
+
+int32_t RandomNode::GenRandomInt(int32_t min, int32_t max) {
+  std::uniform_int_distribution<int32_t> uniDist(min, max);
+  return uniDist(rand_gen_);
+}
+
+// Build for RandomNode
+std::vector<std::shared_ptr<DatasetOp>> RandomNode::Build() {
+  // A vector containing shared pointer to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  rand_gen_.seed(GetSeed());  // seed the random generator
+  // If total rows was not given, the runtime op picks a random number of rows to generate
+  std::shared_ptr<SchemaObj> schema_obj;
+  if (!schema_path_.empty()) {
+    schema_obj = Schema(schema_path_);
+    if (schema_obj == nullptr) {
+      return {};
+    }
+  }
+
+  std::string schema_json_string, schema_file_path;
+  if (schema_ != nullptr) {
+    schema_->set_dataset_type("Random");
+    if (total_rows_ != 0) {
+      schema_->set_num_rows(total_rows_);
+    }
+    schema_json_string = schema_->to_json();
+  } else {
+    schema_file_path = schema_path_;
+  }
+
+  std::unique_ptr<DataSchema> data_schema;
+  std::vector<std::string> columns_to_load;
+  if (columns_list_.size() > 0) {
+    columns_to_load = columns_list_;
+  }
+  if (!schema_file_path.empty() || !schema_json_string.empty()) {
+    data_schema = std::make_unique<DataSchema>();
+    if (!schema_file_path.empty()) {
+      data_schema->LoadSchemaFile(schema_file_path, columns_to_load);
+    } else if (!schema_json_string.empty()) {
+      data_schema->LoadSchemaString(schema_json_string, columns_to_load);
+    }
+  }
+  std::shared_ptr<RandomDataOp> op;
+  op = std::make_shared<RandomDataOp>(num_workers_, connector_que_size_, rows_per_buffer_, total_rows_,
+                                      std::move(data_schema), std::move(sampler_->Build()));
+  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
+
+  node_ops.push_back(op);
+
+  return node_ops;
+}
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
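Build() resolves the schema from exactly one of two sources, mirroring the two constructors: an in-memory SchemaObj
(serialized to JSON) or a schema file path. A hypothetical sketch of that choice (assumptions: Schema() and
RandomSampler() are the factory helpers this API declares in include/datasets.h; paths and row counts are
illustrative):

  auto sampler = RandomSampler();                // assumed SamplerObj factory
  std::shared_ptr<SchemaObj> schema = Schema();  // assumed SchemaObj factory; empty path -> in-memory object
  auto from_obj  = std::make_shared<RandomNode>(50, schema, std::vector<std::string>{}, sampler, nullptr);
  auto from_file = std::make_shared<RandomNode>(50, std::string("/path/schema.json"),
                                                std::vector<std::string>{}, sampler, nullptr);
  // from_obj serializes the schema and hits LoadSchemaString(); from_file hits LoadSchemaFile() instead.
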
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h
new file mode 100644
index 0000000000..aa1c07fcc2
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h
@@ -0,0 +1,86 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_RANDOM_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_RANDOM_NODE_H_
+
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/include/datasets.h"
+
+namespace mindspore {
+namespace dataset {
+namespace api {
+
+class RandomNode : public Dataset {
+ public:
+  // Some constants to provide limits to random generation.
+  static constexpr int32_t kMaxNumColumns = 4;
+  static constexpr int32_t kMaxRank = 4;
+  static constexpr int32_t kMaxDimValue = 32;
+
+  /// \brief Constructor
+  RandomNode(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema,
+             const std::vector<std::string> &columns_list, const std::shared_ptr<SamplerObj> &sampler,
+             std::shared_ptr<DatasetCache> cache)
+      : Dataset(std::move(cache)),
+        total_rows_(total_rows),
+        schema_path_(""),
+        schema_(std::move(schema)),
+        columns_list_(columns_list),
+        sampler_(std::move(sampler)) {}
+
+  /// \brief Constructor
+  RandomNode(const int32_t &total_rows, std::string schema_path, const std::vector<std::string> &columns_list,
+             const std::shared_ptr<SamplerObj> &sampler, std::shared_ptr<DatasetCache> cache)
+      : Dataset(std::move(cache)),
+        total_rows_(total_rows),
+        schema_path_(schema_path),
+        columns_list_(columns_list),
+        sampler_(std::move(sampler)) {}
+
+  /// \brief Destructor
+  ~RandomNode() = default;
+
+  /// \brief a base class override function to create the required runtime dataset op objects for this class
+  /// \return The list of shared pointers to the newly created DatasetOps
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return Status Status::OK() if all the parameters are valid
+  Status ValidateParams() override;
+
+ private:
+  /// \brief A quick inline for producing a random number between (and including) min/max
+  /// \param[in] min minimum number that can be generated.
+  /// \param[in] max maximum number that can be generated.
+  /// \return The generated random number
+  int32_t GenRandomInt(int32_t min, int32_t max);
+
+  int32_t total_rows_;
+  std::string schema_path_;
+  std::shared_ptr<SchemaObj> schema_;
+  std::vector<std::string> columns_list_;
+  std::shared_ptr<SamplerObj> sampler_;
+  std::mt19937 rand_gen_;
+};
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_RANDOM_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc
new file mode 100644
index 0000000000..d445ebcd67
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/engine/datasetops/source/text_file_op.h"
+
+#include "minddata/dataset/util/status.h"
+namespace mindspore {
+namespace dataset {
+namespace api {
+// Constructor for TextFileNode
+TextFileNode::TextFileNode(std::vector<std::string> dataset_files, int32_t num_samples, ShuffleMode shuffle,
+                           int32_t num_shards, int32_t shard_id, std::shared_ptr<DatasetCache> cache)
+    : Dataset(std::move(cache)),
+      dataset_files_(dataset_files),
+      num_samples_(num_samples),
+      shuffle_(shuffle),
+      num_shards_(num_shards),
+      shard_id_(shard_id) {}
+
+Status TextFileNode::ValidateParams() {
+  RETURN_IF_NOT_OK(ValidateDatasetFilesParam("TextFileNode", dataset_files_));
+
+  if (num_samples_ < 0) {
+    std::string err_msg = "TextFileNode: Invalid number of samples: " + std::to_string(num_samples_);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  RETURN_IF_NOT_OK(ValidateDatasetShardParams("TextFileNode", num_shards_, shard_id_));
+
+  return Status::OK();
+}
+
+// Function to build TextFileNode
+std::vector<std::shared_ptr<DatasetOp>> TextFileNode::Build() {
+  // A vector containing shared pointer to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
+
+  // Sort the dataset files in a lexicographical order
+  std::vector<std::string> sorted_dataset_files = dataset_files_;
+  std::sort(sorted_dataset_files.begin(), sorted_dataset_files.end());
+
+  // Do internal Schema generation.
+  auto schema = std::make_unique<DataSchema>();
+  RETURN_EMPTY_IF_ERROR(
+    schema->AddColumn(ColDescriptor("text", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
+
+  // Create and initialize TextFileOp
+  std::shared_ptr<TextFileOp> text_file_op = std::make_shared<TextFileOp>(
+    num_workers_, rows_per_buffer_, num_samples_, worker_connector_size_, std::move(schema), sorted_dataset_files,
+    connector_que_size_, shuffle_files, num_shards_, shard_id_, nullptr);
+  RETURN_EMPTY_IF_ERROR(text_file_op->Init());
+
+  if (shuffle_ == ShuffleMode::kGlobal) {
+    // Inject ShuffleOp
+    std::shared_ptr<DatasetOp> shuffle_op = nullptr;
+    int64_t num_rows = 0;
+
+    // First, get the number of rows in the dataset
+    RETURN_EMPTY_IF_ERROR(TextFileOp::CountAllFileRows(sorted_dataset_files, &num_rows));
+
+    // Add the shuffle op after this op
+    RETURN_EMPTY_IF_ERROR(AddShuffleOp(sorted_dataset_files.size(), num_shards_, num_rows, 0, connector_que_size_,
+                                       rows_per_buffer_, &shuffle_op));
+    node_ops.push_back(shuffle_op);
+  }
+  RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops));
+
+  // Add TextFileOp
+  node_ops.push_back(text_file_op);
+
+  return node_ops;
+}
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
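TextFileNode shuffles on two levels: ShuffleMode::kFiles only randomizes file order (the shuffle_files flag), while
ShuffleMode::kGlobal additionally injects a row-level ShuffleOp sized from CountAllFileRows. A compact sketch of
that decision table (the flag names mirror this file; the helper itself is illustrative, not part of the patch):

  #include <utility>

  // Returns {shuffle file order?, inject a row-level ShuffleOp?} for a given mode.
  std::pair<bool, bool> ShuffleDecision(ShuffleMode mode) {
    const bool shuffle_files = (mode == ShuffleMode::kGlobal || mode == ShuffleMode::kFiles);
    const bool global_rows = (mode == ShuffleMode::kGlobal);
    return {shuffle_files, global_rows};
  }
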
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h
new file mode 100644
index 0000000000..2bfd384187
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_TEXT_FILE_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_TEXT_FILE_NODE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/include/datasets.h"
+
+namespace mindspore {
+namespace dataset {
+namespace api {
+/// \class TextFileNode
+/// \brief A Dataset derived class to represent TextFile dataset
+class TextFileNode : public Dataset {
+ public:
+  /// \brief Constructor
+  TextFileNode(std::vector<std::string> dataset_files, int32_t num_samples, ShuffleMode shuffle, int32_t num_shards,
+               int32_t shard_id, std::shared_ptr<DatasetCache> cache);
+
+  /// \brief Destructor
+  ~TextFileNode() = default;
+
+  /// \brief a base class override function to create the required runtime dataset op objects for this class
+  /// \return The list of shared pointers to the newly created DatasetOps
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return Status Status::OK() if all the parameters are valid
+  Status ValidateParams() override;
+
+ private:
+  std::vector<std::string> dataset_files_;
+  int32_t num_samples_;
+  int32_t num_shards_;
+  int32_t shard_id_;
+  ShuffleMode shuffle_;
+};
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_TEXT_FILE_NODE_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc
new file mode 100644
index 0000000000..5e8d5ef6fe
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc
@@ -0,0 +1,85 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h" + +#include +#include +#include +#include +#include + +#include "minddata/dataset/engine/jagged_connector.h" +#include "minddata/dataset/engine/datasetops/source/tf_reader_op.h" + +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { +// Validator for TFRecordNode +Status TFRecordNode::ValidateParams() { return Status::OK(); } + +// Function to build TFRecordNode +std::vector> TFRecordNode::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + // Sort the datasets file in a lexicographical order + std::vector sorted_dir_files = dataset_files_; + std::sort(sorted_dir_files.begin(), sorted_dir_files.end()); + + // Create Schema Object + std::unique_ptr data_schema = std::make_unique(); + if (!schema_path_.empty()) { + RETURN_EMPTY_IF_ERROR(data_schema->LoadSchemaFile(schema_path_, columns_list_)); + } else if (schema_obj_ != nullptr) { + std::string schema_json_string = schema_obj_->to_json(); + RETURN_EMPTY_IF_ERROR(data_schema->LoadSchemaString(schema_json_string, columns_list_)); + } + + bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles); + + // Create and initialize TFReaderOp + std::shared_ptr tf_reader_op = std::make_shared( + num_workers_, worker_connector_size_, rows_per_buffer_, num_samples_, sorted_dir_files, std::move(data_schema), + connector_que_size_, columns_list_, shuffle_files, num_shards_, shard_id_, shard_equal_rows_, nullptr); + + RETURN_EMPTY_IF_ERROR(tf_reader_op->Init()); + + if (shuffle_ == ShuffleMode::kGlobal) { + // Inject ShuffleOp + + std::shared_ptr shuffle_op = nullptr; + int64_t num_rows = 0; + + // First, get the number of rows in the dataset + RETURN_EMPTY_IF_ERROR(TFReaderOp::CountTotalRows(&num_rows, sorted_dir_files)); + + // Add the shuffle op after this op + RETURN_EMPTY_IF_ERROR(AddShuffleOp(sorted_dir_files.size(), num_shards_, num_rows, 0, connector_que_size_, + rows_per_buffer_, &shuffle_op)); + node_ops.push_back(shuffle_op); + } + RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops)); + + // Add TFReaderOp + node_ops.push_back(tf_reader_op); + return node_ops; +} + +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h new file mode 100644 index 0000000000..5c97deebcb --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h @@ -0,0 +1,90 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_TF_RECORD_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_TF_RECORD_NODE_H_ + +#include +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { +/// \class TFRecordNode +/// \brief A Dataset derived class to represent TFRecord dataset +class TFRecordNode : public Dataset { + public: + /// \brief Constructor + /// \note Parameter 'schema' is the path to the schema file + TFRecordNode(const std::vector &dataset_files, std::string schema, + const std::vector &columns_list, int64_t num_samples, ShuffleMode shuffle, + int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr cache) + : Dataset(std::move(cache)), + dataset_files_(dataset_files), + schema_path_(schema), + columns_list_(columns_list), + num_samples_(num_samples), + shuffle_(shuffle), + num_shards_(num_shards), + shard_id_(shard_id), + shard_equal_rows_(shard_equal_rows) {} + + /// \brief Constructor + /// \note Parameter 'schema' is shared pointer to Schema object + TFRecordNode(const std::vector &dataset_files, std::shared_ptr schema, + const std::vector &columns_list, int64_t num_samples, ShuffleMode shuffle, + int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr cache) + : Dataset(std::move(cache)), + dataset_files_(dataset_files), + schema_obj_(schema), + columns_list_(columns_list), + num_samples_(num_samples), + shuffle_(shuffle), + num_shards_(num_shards), + shard_id_(shard_id), + shard_equal_rows_(shard_equal_rows) {} + + /// \brief Destructor + ~TFRecordNode() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return The list of shared pointers to the newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + std::vector dataset_files_; + std::string schema_path_; // schema_path_ path to schema file. It is set when type of schema parameter is string + std::shared_ptr schema_obj_; // schema_obj_ schema object. + std::vector columns_list_; + int64_t num_samples_; + ShuffleMode shuffle_; + int32_t num_shards_; + int32_t shard_id_; + bool shard_equal_rows_; +}; + +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_TF_RECORD_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc new file mode 100644 index 0000000000..0e556bde86 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc @@ -0,0 +1,117 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h" + +#include +#include +#include +#include +#include + +#include "minddata/dataset/engine/datasetops/source/voc_op.h" + +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { +// Constructor for VOCNode +VOCNode::VOCNode(const std::string &dataset_dir, const std::string &task, const std::string &usage, + const std::map &class_indexing, bool decode, std::shared_ptr sampler, + std::shared_ptr cache) + : Dataset(std::move(cache)), + dataset_dir_(dataset_dir), + task_(task), + usage_(usage), + class_index_(class_indexing), + decode_(decode), + sampler_(sampler) {} + +Status VOCNode::ValidateParams() { + Path dir(dataset_dir_); + + RETURN_IF_NOT_OK(ValidateDatasetDirParam("VOCNode", dataset_dir_)); + + RETURN_IF_NOT_OK(ValidateDatasetSampler("VOCNode", sampler_)); + + if (task_ == "Segmentation") { + if (!class_index_.empty()) { + std::string err_msg = "VOCNode: class_indexing is invalid in Segmentation task."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + Path imagesets_file = dir / "ImageSets" / "Segmentation" / usage_ + ".txt"; + if (!imagesets_file.Exists()) { + std::string err_msg = "VOCNode: Invalid usage: " + usage_ + ", file does not exist"; + MS_LOG(ERROR) << "VOCNode: Invalid usage: " << usage_ << ", file \"" << imagesets_file << "\" does not exist!"; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + } else if (task_ == "Detection") { + Path imagesets_file = dir / "ImageSets" / "Main" / usage_ + ".txt"; + if (!imagesets_file.Exists()) { + std::string err_msg = "VOCNode: Invalid usage: " + usage_ + ", file does not exist"; + MS_LOG(ERROR) << "VOCNode: Invalid usage: " << usage_ << ", file \"" << imagesets_file << "\" does not exist!"; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + } else { + std::string err_msg = "VOCNode: Invalid task: " + task_; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + return Status::OK(); +} + +// Function to build VOCNode +std::vector> VOCNode::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + auto schema = std::make_unique(); + VOCOp::TaskType task_type_; + + if (task_ == "Segmentation") { + task_type_ = VOCOp::TaskType::Segmentation; + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string(kColumnImage), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string(kColumnTarget), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); + } else if (task_ == "Detection") { + task_type_ = VOCOp::TaskType::Detection; + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string(kColumnImage), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string(kColumnBbox), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string(kColumnLabel), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string(kColumnDifficult), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + RETURN_EMPTY_IF_ERROR(schema->AddColumn( + ColDescriptor(std::string(kColumnTruncate), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + } + + std::shared_ptr voc_op; + voc_op = std::make_shared(task_type_, usage_, dataset_dir_, 
class_index_, num_workers_, rows_per_buffer_, + connector_que_size_, decode_, std::move(schema), std::move(sampler_->Build())); + RETURN_EMPTY_IF_ERROR(AddCacheOp(&node_ops)); + + node_ops.push_back(voc_op); + return node_ops; +} + +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h new file mode 100644 index 0000000000..7bde316663 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h @@ -0,0 +1,65 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_VOC_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_VOC_NODE_H_ + +#include +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { +class VOCNode : public Dataset { + public: + /// \brief Constructor + VOCNode(const std::string &dataset_dir, const std::string &task, const std::string &usage, + const std::map &class_indexing, bool decode, std::shared_ptr sampler, + std::shared_ptr cache); + + /// \brief Destructor + ~VOCNode() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return shared pointer to the list of newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + const std::string kColumnImage = "image"; + const std::string kColumnTarget = "target"; + const std::string kColumnBbox = "bbox"; + const std::string kColumnLabel = "label"; + const std::string kColumnDifficult = "difficult"; + const std::string kColumnTruncate = "truncate"; + std::string dataset_dir_; + std::string task_; + std::string usage_; + std::map class_index_; + bool decode_; + std::shared_ptr sampler_; +}; +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_VOC_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h index 9f7027dc6a..6706878be5 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_TAKE_NODE_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_TAKE_NODE_H_ +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_TAKE_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_TAKE_NODE_H_ #include #include @@ -51,4 +51,4 @@ class TakeNode : public Dataset { } // namespace api } // namespace dataset } // namespace mindspore -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_TAKE_NODE_H_ +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_TAKE_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.cc new file mode 100644 index 0000000000..b34f385530 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.cc @@ -0,0 +1,62 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "minddata/dataset/engine/ir/datasetops/zip_node.h" + +#include +#include +#include + +#include "minddata/dataset/engine/datasetops/zip_op.h" + +#include "minddata/dataset/util/status.h" +namespace mindspore { +namespace dataset { +namespace api { + +ZipNode::ZipNode(const std::vector> &datasets) : datasets_(datasets) { + for (auto dataset : datasets_) { + this->children.push_back(dataset); + } +} + +Status ZipNode::ValidateParams() { + if (datasets_.empty()) { + std::string err_msg = "ZipNode: datasets to zip are not specified."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + if (find(datasets_.begin(), datasets_.end(), nullptr) != datasets_.end()) { + std::string err_msg = "ZipNode: zip datasets should not be null."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + return Status::OK(); +} + +std::vector> ZipNode::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + node_ops.push_back(std::make_shared(rows_per_buffer_, connector_que_size_)); + return node_ops; +} + +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.h new file mode 100644 index 0000000000..0073f66e92 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.h @@ -0,0 +1,53 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_ZIP_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_ZIP_NODE_H_ + +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { + +class ZipNode : public Dataset { + public: + /// \brief Constructor + explicit ZipNode(const std::vector> &datasets); + + /// \brief Destructor + ~ZipNode() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return The list of shared pointers to the newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return Status Status::OK() if all the parameters are valid + Status ValidateParams() override; + + private: + std::vector> datasets_; +}; + +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_ZIP_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h index dc0c1ebbb2..85b08cc1a1 100644 --- a/mindspore/ccsrc/minddata/dataset/include/datasets.h +++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h @@ -106,13 +106,20 @@ class ZipNode; } \ } while (false) +Status AddShuffleOp(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows, + int32_t connector_que_size, int32_t rows_per_buffer, std::shared_ptr *shuffle_op); + +// Helper function to validate dataset files parameter +Status ValidateDatasetFilesParam(const std::string &dataset_name, const std::vector &dataset_files); + // Helper function to validate dataset num_shards and shard_id parameters Status ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_shards, int32_t shard_id); // Helper function to validate dataset sampler parameter Status ValidateDatasetSampler(const std::string &dataset_name, const std::shared_ptr &sampler); -Status ValidateStringValue(const std::string &str, const std::unordered_set &valid_strings); +Status ValidateStringValue(const std::string &dataset_name, const std::string &str, + const std::unordered_set &valid_strings); // Helper function to validate dataset input/output column parameterCD - Status ValidateDatasetColumnParam(const std::string &dataset_name, const std::string &column_param, @@ -815,551 +822,8 @@ class SchemaObj { /* ####################################### Derived Dataset classes ################################# */ -// DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS -// (In alphabetical order) - -class AlbumNode : public Dataset { - public: - /// \brief Constructor - AlbumNode(const std::string &dataset_dir, const std::string &data_schema, - const std::vector &column_names, bool decode, const std::shared_ptr &sampler); - - /// \brief Destructor - ~AlbumNode() = default; - - /// \brief a base class override function to create a runtime dataset op object from this class - /// \return shared pointer to the newly created DatasetOp - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::string dataset_dir_; - std::string schema_path_; - std::vector column_names_; - bool decode_; - std::shared_ptr sampler_; -}; - -class CelebANode : public Dataset { 
- public: - /// \brief Constructor - CelebANode(const std::string &dataset_dir, const std::string &usage, const std::shared_ptr &sampler, - const bool &decode, const std::set &extensions, const std::shared_ptr &cache); - - /// \brief Destructor - ~CelebANode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return shared pointer to the list of newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::string dataset_dir_; - std::string usage_; - bool decode_; - std::set extensions_; - std::shared_ptr sampler_; -}; -// DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS -// (In alphabetical order) - -class Cifar10Node : public Dataset { - public: - /// \brief Constructor - Cifar10Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr sampler, - std::shared_ptr cache); - - /// \brief Destructor - ~Cifar10Node() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return The list of shared pointers to the newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::string dataset_dir_; - std::string usage_; - std::shared_ptr sampler_; -}; - -class Cifar100Node : public Dataset { - public: - /// \brief Constructor - Cifar100Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr sampler, - std::shared_ptr cache); - - /// \brief Destructor - ~Cifar100Node() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return The list of shared pointers to the newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::string dataset_dir_; - std::string usage_; - std::shared_ptr sampler_; -}; - -/// \class CLUENode -/// \brief A Dataset derived class to represent CLUE dataset -class CLUENode : public Dataset { - public: - /// \brief Constructor - CLUENode(const std::vector dataset_files, std::string task, std::string usage, int64_t num_samples, - ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, std::shared_ptr cache); - - /// \brief Destructor - ~CLUENode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return The list of shared pointers to the newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - /// \brief Split string based on a character delimiter - /// \return A string vector - std::vector split(const std::string &s, char delim); - - std::vector dataset_files_; - std::string task_; - std::string usage_; - int64_t num_samples_; - ShuffleMode shuffle_; - int32_t num_shards_; - int32_t shard_id_; -}; - -class CocoNode : public Dataset { - public: - /// \brief Constructor - CocoNode(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task, - const bool &decode, const std::shared_ptr &sampler, 
std::shared_ptr cache); - - /// \brief Destructor - ~CocoNode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return shared pointer to the list of newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::string dataset_dir_; - std::string annotation_file_; - std::string task_; - bool decode_; - std::shared_ptr sampler_; -}; - -/// \brief Record type for CSV -enum CsvType : uint8_t { INT = 0, FLOAT, STRING }; - -/// \brief Base class of CSV Record -class CsvBase { - public: - CsvBase() = default; - explicit CsvBase(CsvType t) : type(t) {} - virtual ~CsvBase() {} - CsvType type; -}; - -/// \brief CSV Record that can represent integer, float and string. -template -class CsvRecord : public CsvBase { - public: - CsvRecord() = default; - CsvRecord(CsvType t, T v) : CsvBase(t), value(v) {} - ~CsvRecord() {} - T value; -}; - -class CSVNode : public Dataset { - public: - /// \brief Constructor - CSVNode(const std::vector &dataset_files, char field_delim, - const std::vector> &column_defaults, const std::vector &column_names, - int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, - std::shared_ptr cache); - - /// \brief Destructor - ~CSVNode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return shared pointer to the list of newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::vector dataset_files_; - char field_delim_; - std::vector> column_defaults_; - std::vector column_names_; - int64_t num_samples_; - ShuffleMode shuffle_; - int32_t num_shards_; - int32_t shard_id_; -}; - -#ifndef ENABLE_ANDROID -class ManifestNode : public Dataset { - public: - /// \brief Constructor - ManifestNode(const std::string &dataset_file, const std::string &usage, const std::shared_ptr &sampler, - const std::map &class_indexing, bool decode, std::shared_ptr cache); - - /// \brief Destructor - ~ManifestNode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return The list of shared pointers to the newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::string dataset_file_; - std::string usage_; - bool decode_; - std::map class_index_; - std::shared_ptr sampler_; -}; -#endif - -#ifndef ENABLE_ANDROID -class MindDataNode : public Dataset { - public: - /// \brief Constructor - MindDataNode(const std::vector &dataset_files, const std::vector &columns_list, - const std::shared_ptr &sampler, nlohmann::json padded_sample, int64_t num_padded); - - /// \brief Constructor - MindDataNode(const std::string &dataset_file, const std::vector &columns_list, - const std::shared_ptr &sampler, nlohmann::json padded_sample, int64_t num_padded); - - /// \brief Destructor - ~MindDataNode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return The list of shared pointers to the newly created DatasetOps - 
std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - /// \brief Build sampler chain for minddata dataset - /// \return Status Status::OK() if input sampler is valid - Status BuildMindDatasetSamplerChain(const std::shared_ptr &sampler, - std::vector> *operators_, - int64_t num_padded); - - /// \brief Set sample_bytes when padded_sample has py::byte value - /// \note Pybind will use this function to set sample_bytes into MindDataNode - void SetSampleBytes(std::map *sample_bytes); - - private: - std::string dataset_file_; // search_for_pattern_ will be true in this mode - std::vector dataset_files_; // search_for_pattern_ will be false in this mode - bool search_for_pattern_; - std::vector columns_list_; - std::shared_ptr sampler_; - nlohmann::json padded_sample_; - std::map sample_bytes_; // enable in python - int64_t num_padded_; -}; -#endif - -class MnistNode : public Dataset { - public: - /// \brief Constructor - MnistNode(std::string dataset_dir, std::string usage, std::shared_ptr sampler, - std::shared_ptr cache); - - /// \brief Destructor - ~MnistNode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return The list of shared pointers to the newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::string dataset_dir_; - std::string usage_; - std::shared_ptr sampler_; -}; - -class RandomNode : public Dataset { - public: - // Some constants to provide limits to random generation. - static constexpr int32_t kMaxNumColumns = 4; - static constexpr int32_t kMaxRank = 4; - static constexpr int32_t kMaxDimValue = 32; - - /// \brief Constructor - RandomNode(const int32_t &total_rows, std::shared_ptr schema, const std::vector &columns_list, - const std::shared_ptr &sampler, std::shared_ptr cache) - : Dataset(std::move(cache)), - total_rows_(total_rows), - schema_path_(""), - schema_(std::move(schema)), - columns_list_(columns_list), - sampler_(std::move(sampler)) {} - - /// \brief Constructor - RandomNode(const int32_t &total_rows, std::string schema_path, const std::vector &columns_list, - const std::shared_ptr &sampler, std::shared_ptr cache) - : Dataset(std::move(cache)), - total_rows_(total_rows), - schema_path_(schema_path), - columns_list_(columns_list), - sampler_(std::move(sampler)) {} - - /// \brief Destructor - ~RandomNode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return The list of shared pointers to the newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - /// \brief A quick inline for producing a random number between (and including) min/max - /// \param[in] min minimum number that can be generated. - /// \param[in] max maximum number that can be generated. 
- /// \return The generated random number - int32_t GenRandomInt(int32_t min, int32_t max); - - int32_t total_rows_; - std::string schema_path_; - std::shared_ptr schema_; - std::vector columns_list_; - std::shared_ptr sampler_; - std::mt19937 rand_gen_; -}; - -/// \class TextFileNode -/// \brief A Dataset derived class to represent TextFile dataset -class TextFileNode : public Dataset { - public: - /// \brief Constructor - TextFileNode(std::vector dataset_files, int32_t num_samples, ShuffleMode shuffle, int32_t num_shards, - int32_t shard_id, std::shared_ptr cache); - - /// \brief Destructor - ~TextFileNode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return The list of shared pointers to the newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::vector dataset_files_; - int32_t num_samples_; - int32_t num_shards_; - int32_t shard_id_; - ShuffleMode shuffle_; -}; - -/// \class TFRecordNode -/// \brief A Dataset derived class to represent TFRecord dataset -class TFRecordNode : public Dataset { - public: - /// \brief Constructor - /// \note Parameter 'schema' is the path to the schema file - TFRecordNode(const std::vector &dataset_files, std::string schema, - const std::vector &columns_list, int64_t num_samples, ShuffleMode shuffle, - int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr cache) - : Dataset(std::move(cache)), - dataset_files_(dataset_files), - schema_path_(schema), - columns_list_(columns_list), - num_samples_(num_samples), - shuffle_(shuffle), - num_shards_(num_shards), - shard_id_(shard_id), - shard_equal_rows_(shard_equal_rows) {} - - /// \brief Constructor - /// \note Parameter 'schema' is shared pointer to Schema object - TFRecordNode(const std::vector &dataset_files, std::shared_ptr schema, - const std::vector &columns_list, int64_t num_samples, ShuffleMode shuffle, - int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr cache) - : Dataset(std::move(cache)), - dataset_files_(dataset_files), - schema_obj_(schema), - columns_list_(columns_list), - num_samples_(num_samples), - shuffle_(shuffle), - num_shards_(num_shards), - shard_id_(shard_id), - shard_equal_rows_(shard_equal_rows) {} - - /// \brief Destructor - ~TFRecordNode() = default; - - /// \brief a base class override function to create the required runtime dataset op objects for this class - /// \return The list of shared pointers to the newly created DatasetOps - std::vector> Build() override; - - /// \brief Parameters validation - /// \return Status Status::OK() if all the parameters are valid - Status ValidateParams() override; - - private: - std::vector dataset_files_; - std::string schema_path_; // schema_path_ path to schema file. It is set when type of schema parameter is string - std::shared_ptr schema_obj_; // schema_obj_ schema object. 
-  std::vector<std::string> columns_list_;
-  int64_t num_samples_;
-  ShuffleMode shuffle_;
-  int32_t num_shards_;
-  int32_t shard_id_;
-  bool shard_equal_rows_;
-};
-
-#ifndef ENABLE_ANDROID
-class VOCNode : public Dataset {
- public:
-  /// \brief Constructor
-  VOCNode(const std::string &dataset_dir, const std::string &task, const std::string &usage,
-          const std::map<std::string, int32_t> &class_indexing, bool decode, std::shared_ptr<SamplerObj> sampler,
-          std::shared_ptr<DatasetCache> cache);
-
-  /// \brief Destructor
-  ~VOCNode() = default;
-
-  /// \brief a base class override function to create the required runtime dataset op objects for this class
-  /// \return shared pointer to the list of newly created DatasetOps
-  std::vector<std::shared_ptr<DatasetOp>> Build() override;
-
-  /// \brief Parameters validation
-  /// \return Status Status::OK() if all the parameters are valid
-  Status ValidateParams() override;
-
- private:
-  const std::string kColumnImage = "image";
-  const std::string kColumnTarget = "target";
-  const std::string kColumnBbox = "bbox";
-  const std::string kColumnLabel = "label";
-  const std::string kColumnDifficult = "difficult";
-  const std::string kColumnTruncate = "truncate";
-  std::string dataset_dir_;
-  std::string task_;
-  std::string usage_;
-  std::map<std::string, int32_t> class_index_;
-  bool decode_;
-  std::shared_ptr<SamplerObj> sampler_;
-};
-#endif
-
-// DERIVED DATASET CLASSES FOR DATASET OPS
-// (In alphabetical order)
-
-class MapNode : public Dataset {
- public:
-  /// \brief Constructor
-  MapNode(std::shared_ptr<Dataset> child, std::vector<std::shared_ptr<TensorOperation>> operations,
-          std::vector<std::string> input_columns = {}, std::vector<std::string> output_columns = {},
-          const std::vector<std::string> &columns = {}, std::shared_ptr<DatasetCache> cache = nullptr);
-
-  /// \brief Destructor
-  ~MapNode() = default;
-
-  /// \brief a base class override function to create the required runtime dataset op objects for this class
-  /// \return The list of shared pointers to the newly created DatasetOps
-  std::vector<std::shared_ptr<DatasetOp>> Build() override;
-
-  /// \brief Parameters validation
-  /// \return Status Status::OK() if all the parameters are valid
-  Status ValidateParams() override;
-
- private:
-  std::vector<std::shared_ptr<TensorOperation>> operations_;
-  std::vector<std::string> input_columns_;
-  std::vector<std::string> output_columns_;
-  std::vector<std::string> project_columns_;
-};
-
-class SkipNode : public Dataset {
- public:
-  /// \brief Constructor
-  explicit SkipNode(std::shared_ptr<Dataset> child, int32_t count);
-
-  /// \brief Destructor
-  ~SkipNode() = default;
-
-  /// \brief a base class override function to create the required runtime dataset op objects for this class
-  /// \return The list of shared pointers to the newly created DatasetOps
-  std::vector<std::shared_ptr<DatasetOp>> Build() override;
-
-  /// \brief Parameters validation
-  /// \return Status Status::OK() if all the parameters are valid
-  Status ValidateParams() override;
-
- private:
-  int32_t skip_count_;
-};
-
-class ZipNode : public Dataset {
- public:
-  /// \brief Constructor
-  explicit ZipNode(const std::vector<std::shared_ptr<Dataset>> &datasets);
-
-  /// \brief Destructor
-  ~ZipNode() = default;
-
-  /// \brief a base class override function to create the required runtime dataset op objects for this class
-  /// \return The list of shared pointers to the newly created DatasetOps
-  std::vector<std::shared_ptr<DatasetOp>> Build() override;
-
-  /// \brief Parameters validation
-  /// \return Status Status::OK() if all the parameters are valid
-  Status ValidateParams() override;
-
- private:
-  std::vector<std::shared_ptr<Dataset>> datasets_;
-};
-
 }  // namespace api
 }  // namespace dataset
 }  // namespace mindspore
+
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASETS_H_
diff --git a/mindspore/lite/minddata/CMakeLists.txt b/mindspore/lite/minddata/CMakeLists.txt
index b5361adeac..d6dc3e5cce 100644
--- a/mindspore/lite/minddata/CMakeLists.txt
+++ b/mindspore/lite/minddata/CMakeLists.txt
@@ -81,18 +81,18 @@ AUX_SOURCE_DIRECTORY(${MINDDATA_DIR}/kernels/image/lite_cv MINDDATA_KERNELS_IMA
 if (BUILD_MINDDATA STREQUAL "full")
-    list(REMOVE_ITEM MINDDATA_API_SRC_FILES
-            "${MINDDATA_DIR}/api/text.cc"
+    list(REMOVE_ITEM MINDDATA_API_SRC_FILES
+            "${MINDDATA_DIR}/api/text.cc"
             "${MINDDATA_DIR}/api/de_tensor.cc"
             "${MINDDATA_DIR}/api/execute.cc"
             )
-
-    list(REMOVE_ITEM MINDDATA_CALLBACK_SRC_FILES
-            "${MINDDATA_DIR}/callback/py_ds_callback.cc"
+
+    list(REMOVE_ITEM MINDDATA_CALLBACK_SRC_FILES
+            "${MINDDATA_DIR}/callback/py_ds_callback.cc"
             )
-
+
     list(REMOVE_ITEM MINDDATA_KERNELS_SRC_FILES "${MINDDATA_DIR}/kernels/py_func_op.cc")
-    list(REMOVE_ITEM MINDDATA_ENGINE_DATASETOPS_SRC_FILES
+    list(REMOVE_ITEM MINDDATA_ENGINE_DATASETOPS_SRC_FILES
         "${MINDDATA_DIR}/engine/datasetops/build_sentence_piece_vocab_op.cc"
         "${MINDDATA_DIR}/engine/datasetops/filter_op.cc"
         "${MINDDATA_DIR}/engine/datasetops/barrier_op.cc"
@@ -104,7 +104,7 @@ if (BUILD_MINDDATA STREQUAL "full")
         "${MINDDATA_DIR}/engine/datasetops/cache_op.cc"
         )
-    list(REMOVE_ITEM MINDDATA_ENGINE_DATASETOPS_SOURCE_SRC_FILES
+    list(REMOVE_ITEM MINDDATA_ENGINE_DATASETOPS_SOURCE_SRC_FILES
         "${MINDDATA_DIR}/engine/datasetops/source/generator_op.cc"
         "${MINDDATA_DIR}/engine/datasetops/source/voc_op.cc"
         "${MINDDATA_DIR}/engine/datasetops/source/manifest_op.cc"
@@ -131,6 +131,10 @@ if (BUILD_MINDDATA STREQUAL "full")
     list(REMOVE_ITEM MINDDATA_ENGINE_IR_DATASETOPS_SOURCE_SRC_FILES
         "${MINDDATA_DIR}/engine/ir/datasetops/source/generator_node.cc"
+        "${MINDDATA_DIR}/engine/ir/datasetops/source/manifest_node.cc"
+        "${MINDDATA_DIR}/engine/ir/datasetops/source/minddata_node.cc"
+        "${MINDDATA_DIR}/engine/ir/datasetops/source/tf_record_node.cc"
+        "${MINDDATA_DIR}/engine/ir/datasetops/source/voc_node.cc"
         )
     list(REMOVE_ITEM MINDDATA_ENGINE_IR_DATASETOPS_SRC_FILES
@@ -184,7 +188,7 @@ if (BUILD_MINDDATA STREQUAL "full")
         opencv_imgproc
         mindspore::json
         )
-
+
     # ref: https://github.com/android/ndk/issues/1202
     if (PLATFORM_ARM32)
         file(GLOB_RECURSE LIBCLANG_RT_LIB $ENV{ANDROID_NDK}/libclang_rt.builtins-arm-android.a)
@@ -206,7 +210,7 @@ if (BUILD_MINDDATA STREQUAL "full")
 elseif (BUILD_MINDDATA STREQUAL "lite")
     list(REMOVE_ITEM MINDDATA_CORE_SRC_FILES "${MINDDATA_DIR}/core/client.cc")
     list(REMOVE_ITEM MINDDATA_KERNELS_SRC_FILES "${MINDDATA_DIR}/kernels/py_func_op.cc")
-    add_library(minddata_eager_mid OBJECT
+    add_library(minddata_eager_mid OBJECT
         ${MINDDATA_DIR}/api/de_tensor.cc
         ${MINDDATA_DIR}/api/execute.cc
         )
diff --git a/mindspore/lite/minddata/example/jni-example.cc b/mindspore/lite/minddata/example/jni-example.cc
index 9be60716df..469a63c1e6 100644
--- a/mindspore/lite/minddata/example/jni-example.cc
+++ b/mindspore/lite/minddata/example/jni-example.cc
@@ -20,6 +20,7 @@
 #include
 #include
 #include "minddata/dataset/include/datasets.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
 #include "minddata/dataset/util/path.h"
 #if defined(__ANDROID__) || defined(ANDROID)
 #include
diff --git a/mindspore/lite/minddata/example/x86-example.cc b/mindspore/lite/minddata/example/x86-example.cc
index 7458a81b2c..440e3c9bf7 100644
--- a/mindspore/lite/minddata/example/x86-example.cc
+++ b/mindspore/lite/minddata/example/x86-example.cc
@@ -19,6 +19,7 @@
 #include
 #include
 #include "minddata/dataset/include/datasets.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
 #include "minddata/dataset/util/path.h"
 
 using Dataset = mindspore::dataset::api::Dataset;
diff --git a/tests/ut/cpp/dataset/c_api_dataset_album_test.cc b/tests/ut/cpp/dataset/c_api_dataset_album_test.cc
index 166484feee..c6c0ee663d 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_album_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_album_test.cc
@@ -16,6 +16,19 @@
 #include "common/common.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
+#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
+#include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+#include "minddata/dataset/engine/ir/datasetops/project_node.h"
+#include "minddata/dataset/engine/ir/datasetops/rename_node.h"
+#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/album_node.h"
+
 using namespace mindspore::dataset::api;
 using mindspore::dataset::Tensor;
diff --git a/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc b/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc
index b0905ca40f..cd50e8633c 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc
@@ -16,6 +16,22 @@
 #include "common/common.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
+#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
+#include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+#include "minddata/dataset/engine/ir/datasetops/project_node.h"
+#include "minddata/dataset/engine/ir/datasetops/rename_node.h"
+#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
+
 using namespace mindspore::dataset::api;
 using mindspore::dataset::Tensor;
diff --git a/tests/ut/cpp/dataset/c_api_dataset_clue_test.cc b/tests/ut/cpp/dataset/c_api_dataset_clue_test.cc
index 9f2940b8f4..35320dd0c6 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_clue_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_clue_test.cc
@@ -18,10 +18,24 @@
 #include "minddata/dataset/core/global_context.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
+#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
+#include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+#include "minddata/dataset/engine/ir/datasetops/project_node.h"
+#include "minddata/dataset/engine/ir/datasetops/rename_node.h"
+#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h"
+
 using namespace mindspore::dataset::api;
+using mindspore::dataset::GlobalContext;
 using mindspore::dataset::ShuffleMode;
 using mindspore::dataset::Tensor;
-using mindspore::dataset::GlobalContext;
 
 class MindDataTestPipeline : public UT::DatasetOpTesting {
  protected:
@@ -49,11 +63,8 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetAFQMC) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("sentence1"), row.end());
 
-  std::vector<std::string> expected_result = {
-    "蚂蚁借呗等额还款能否换成先息后本",
-    "蚂蚁花呗说我违约了",
-    "帮我看看本月花呗账单结清了没"
-  };
+  std::vector<std::string> expected_result = {"蚂蚁借呗等额还款能否换成先息后本", "蚂蚁花呗说我违约了",
+                                              "帮我看看本月花呗账单结清了没"};
 
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -75,11 +86,7 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetAFQMC) {
   // test
   usage = "test";
-  expected_result = {
-    "借呗取消的时间",
-    "网商贷用什么方法转变成借呗",
-    "我的借呗为什么开通不了"
-  };
+  expected_result = {"借呗取消的时间", "网商贷用什么方法转变成借呗", "我的借呗为什么开通不了"};
   ds = CLUE({test_file}, task, usage, 0, ShuffleMode::kFalse);
   EXPECT_NE(ds, nullptr);
   iter = ds->CreateIterator();
@@ -100,11 +107,7 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetAFQMC) {
   // eval
   usage = "eval";
-  expected_result = {
-    "你有花呗吗",
-    "吃饭能用花呗吗",
-    "蚂蚁花呗支付金额有什么限制"
-  };
+  expected_result = {"你有花呗吗", "吃饭能用花呗吗", "蚂蚁花呗支付金额有什么限制"};
   ds = CLUE({eval_file}, task, usage, 0, ShuffleMode::kFalse);
   EXPECT_NE(ds, nullptr);
   iter = ds->CreateIterator();
@@ -179,11 +182,7 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetCMNLI) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("sentence1"), row.end());
 
-  std::vector<std::string> expected_result = {
-    "你应该给这件衣服定一个价格。",
-    "我怎么知道他要说什么",
-    "向左。"
-  };
+  std::vector<std::string> expected_result = {"你应该给这件衣服定一个价格。", "我怎么知道他要说什么", "向左。"};
 
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -224,11 +223,7 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetCSL) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("abst"), row.end());
 
-  std::vector<std::string> expected_result = {
-    "这是一段长文本",
-    "这是一段长文本",
-    "这是一段长文本"
-  };
+  std::vector<std::string> expected_result = {"这是一段长文本", "这是一段长文本", "这是一段长文本"};
 
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -337,11 +332,7 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetIFLYTEK) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("sentence"), row.end());
 
-  std::vector<std::string> expected_result = {
-    "第一个文本",
-    "第二个文本",
-    "第三个文本"
-  };
+  std::vector<std::string> expected_result = {"第一个文本", "第二个文本", "第三个文本"};
 
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -396,14 +387,12 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetShuffleFilesA) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("sentence1"), row.end());
 
-  std::vector<std::string> expected_result = {
-    "你有花呗吗",
-    "吃饭能用花呗吗",
-    "蚂蚁花呗支付金额有什么限制",
-    "蚂蚁借呗等额还款能否换成先息后本",
-    "蚂蚁花呗说我违约了",
-    "帮我看看本月花呗账单结清了没"
-  };
+  std::vector<std::string> expected_result = {"你有花呗吗",
+                                              "吃饭能用花呗吗",
+                                              "蚂蚁花呗支付金额有什么限制",
+                                              "蚂蚁借呗等额还款能否换成先息后本",
+                                              "蚂蚁花呗说我违约了",
+                                              "帮我看看本月花呗账单结清了没"};
 
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -463,14 +452,12 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetShuffleFilesB) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("sentence1"), row.end());
 
-  std::vector<std::string> expected_result = {
-    "你有花呗吗",
-    "吃饭能用花呗吗",
-    "蚂蚁花呗支付金额有什么限制",
-    "蚂蚁借呗等额还款能否换成先息后本",
-    "蚂蚁花呗说我违约了",
-    "帮我看看本月花呗账单结清了没"
-  };
+  std::vector<std::string> expected_result = {"你有花呗吗",
+                                              "吃饭能用花呗吗",
+                                              "蚂蚁花呗支付金额有什么限制",
+                                              "蚂蚁借呗等额还款能否换成先息后本",
+                                              "蚂蚁花呗说我违约了",
+                                              "帮我看看本月花呗账单结清了没"};
 
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -523,11 +510,8 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetShuffleGlobal) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("sentence1"), row.end());
 
-  std::vector<std::string> expected_result = {
-    "蚂蚁花呗说我违约了",
-    "帮我看看本月花呗账单结清了没",
-    "蚂蚁借呗等额还款能否换成先息后本"
-  };
+  std::vector<std::string> expected_result = {"蚂蚁花呗说我违约了", "帮我看看本月花呗账单结清了没",
+                                              "蚂蚁借呗等额还款能否换成先息后本"};
   uint64_t i = 0;
   while (row.size() != 0) {
     auto text = row["sentence1"];
@@ -572,11 +556,7 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetTNEWS) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("sentence"), row.end());
 
-  std::vector<std::string> expected_result = {
-    "新闻1",
-    "新闻2",
-    "新闻3"
-  };
+  std::vector<std::string> expected_result = {"新闻1", "新闻2", "新闻3"};
 
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -617,11 +597,8 @@ TEST_F(MindDataTestPipeline, TestCLUEDatasetWSC) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("text"), row.end());
 
-  std::vector<std::string> expected_result = {
-    "小明呢,他在哪?",
-    "小红刚刚看到小明,他在操场",
-    "等小明回来,小张你叫他交作业"
-  };
+  std::vector<std::string> expected_result = {"小明呢,他在哪?", "小红刚刚看到小明,他在操场",
+                                              "等小明回来,小张你叫他交作业"};
 
   uint64_t i = 0;
   while (row.size() != 0) {
diff --git a/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc b/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc
index ec1c784b95..71cea0af52 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc
@@ -16,10 +16,40 @@
 #include "common/common.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
+#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
+#include "minddata/dataset/engine/ir/datasetops/build_vocab_node.h"
+#include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+#include "minddata/dataset/engine/ir/datasetops/project_node.h"
+#include "minddata/dataset/engine/ir/datasetops/rename_node.h"
+#include "minddata/dataset/engine/ir/datasetops/repeat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+#include "minddata/dataset/engine/ir/datasetops/take_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/album_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/coco_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/random_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h"
+
 using namespace mindspore::dataset::api;
+using mindspore::dataset::dsize_t;
 using mindspore::dataset::Tensor;
 using mindspore::dataset::TensorShape;
-using mindspore::dataset::dsize_t;
 
 class MindDataTestPipeline : public UT::DatasetOpTesting {
  protected:
@@ -79,12 +109,14 @@ TEST_F(MindDataTestPipeline, TestCocoDetection) {
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
-  std::string expect_file[] = {"000000391895", "000000318219", "000000554625", "000000574769", "000000060623",
-                               "000000309022"};
+  std::string expect_file[] = {"000000391895", "000000318219", "000000554625",
+                               "000000574769", "000000060623", "000000309022"};
   std::vector<std::vector<float>> expect_bbox_vector = {{10.0, 10.0, 10.0, 10.0, 70.0, 70.0, 70.0, 70.0},
                                                         {20.0, 20.0, 20.0, 20.0, 80.0, 80.0, 80.0, 80.0},
-                                                        {30.0, 30.0, 30.0, 30.0}, {40.0, 40.0, 40.0, 40.0},
-                                                        {50.0, 50.0, 50.0, 50.0}, {60.0, 60.0, 60.0, 60.0}};
+                                                        {30.0, 30.0, 30.0, 30.0},
+                                                        {40.0, 40.0, 40.0, 40.0},
+                                                        {50.0, 50.0, 50.0, 50.0},
+                                                        {60.0, 60.0, 60.0, 60.0}};
   std::vector<std::vector<uint32_t>> expect_catagoryid_list = {{1, 7}, {2, 8}, {3}, {4}, {5}, {6}};
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -148,13 +180,13 @@ TEST_F(MindDataTestPipeline, TestCocoKeypoint) {
   iter->GetNextRow(&row);
 
   std::string expect_file[] = {"000000391895", "000000318219"};
-  std::vector<std::vector<float>> expect_keypoint_vector =
-    {{368.0, 61.0, 1.0, 369.0, 52.0, 2.0, 0.0, 0.0, 0.0, 382.0, 48.0, 2.0, 0.0, 0.0, 0.0, 368.0, 84.0, 2.0, 435.0,
-      81.0, 2.0, 362.0, 125.0, 2.0, 446.0, 125.0, 2.0, 360.0, 153.0, 2.0, 0.0, 0.0, 0.0, 397.0, 167.0, 1.0, 439.0,
-      166.0, 1.0, 369.0, 193.0, 2.0, 461.0, 234.0, 2.0, 361.0, 246.0, 2.0, 474.0, 287.0, 2.0},
-     {244.0, 139.0, 2.0, 0.0, 0.0, 0.0, 226.0, 118.0, 2.0, 0.0, 0.0, 0.0, 154.0, 159.0, 2.0, 143.0, 261.0, 2.0, 135.0,
-      312.0, 2.0, 271.0, 423.0, 2.0, 184.0, 530.0, 2.0, 261.0, 280.0, 2.0, 347.0, 592.0, 2.0, 0.0, 0.0, 0.0, 123.0,
-      596.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}};
+  std::vector<std::vector<float>> expect_keypoint_vector = {
+    {368.0, 61.0, 1.0, 369.0, 52.0, 2.0, 0.0, 0.0, 0.0, 382.0, 48.0, 2.0, 0.0, 0.0, 0.0, 368.0, 84.0, 2.0,
+     435.0, 81.0, 2.0, 362.0, 125.0, 2.0, 446.0, 125.0, 2.0, 360.0, 153.0, 2.0, 0.0, 0.0, 0.0, 397.0, 167.0, 1.0,
+     439.0, 166.0, 1.0, 369.0, 193.0, 2.0, 461.0, 234.0, 2.0, 361.0, 246.0, 2.0, 474.0, 287.0, 2.0},
+    {244.0, 139.0, 2.0, 0.0, 0.0, 0.0, 226.0, 118.0, 2.0, 0.0, 0.0, 0.0, 154.0, 159.0, 2.0, 143.0, 261.0, 2.0,
+     135.0, 312.0, 2.0, 271.0, 423.0, 2.0, 184.0, 530.0, 2.0, 261.0, 280.0, 2.0, 347.0, 592.0, 2.0, 0.0, 0.0, 0.0,
+     123.0, 596.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}};
   std::vector<std::vector<dsize_t>> expect_size = {{1, 51}, {1, 51}};
   std::vector<std::vector<uint32_t>> expect_num_keypoints_list = {{14}, {10}};
   uint64_t i = 0;
@@ -258,17 +290,17 @@ TEST_F(MindDataTestPipeline, TestCocoStuff) {
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
-  std::string expect_file[] = {"000000391895", "000000318219", "000000554625", "000000574769", "000000060623",
-                               "000000309022"};
-  std::vector<std::vector<float>> expect_segmentation_vector =
-    {{10.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0,
-      70.0, 72.0, 73.0, 74.0, 75.0, -1.0, -1.0, -1.0, -1.0, -1.0},
-     {20.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
-      10.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, -1.0},
-     {40.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 40.0, 41.0, 42.0},
-     {50.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0},
-     {60.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0},
-     {60.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0}};
+  std::string expect_file[] = {"000000391895", "000000318219", "000000554625",
+                               "000000574769", "000000060623", "000000309022"};
+  std::vector<std::vector<float>> expect_segmentation_vector = {
+    {10.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0,
+     70.0, 72.0, 73.0, 74.0, 75.0, -1.0, -1.0, -1.0, -1.0, -1.0},
+    {20.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
+     10.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, -1.0},
+    {40.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 40.0, 41.0, 42.0},
+    {50.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0},
+    {60.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0},
+    {60.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0}};
   std::vector<std::vector<dsize_t>> expect_size = {{2, 10}, {2, 11}, {1, 12}, {1, 13}, {1, 14}, {2, 7}};
   uint64_t i = 0;
   while (row.size() != 0) {
diff --git a/tests/ut/cpp/dataset/c_api_dataset_config_test.cc b/tests/ut/cpp/dataset/c_api_dataset_config_test.cc
index 4edd7aa0a1..1ab089f406 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_config_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_config_test.cc
@@ -18,13 +18,17 @@
 #include "minddata/dataset/include/config.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
 #include "minddata/dataset/engine/ir/datasetops/batch_node.h"
-#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
 #include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
 #include "minddata/dataset/engine/ir/datasetops/project_node.h"
 #include "minddata/dataset/engine/ir/datasetops/rename_node.h"
 #include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
+
 using namespace mindspore::dataset::api;
 using mindspore::dataset::ShuffleMode;
 using mindspore::dataset::Tensor;
diff --git a/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc b/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc
index 28f862ded1..825ce7099d 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc
@@ -18,10 +18,40 @@
 #include "minddata/dataset/core/global_context.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
+#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
+#include "minddata/dataset/engine/ir/datasetops/build_vocab_node.h"
+#include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+#include "minddata/dataset/engine/ir/datasetops/project_node.h"
+#include "minddata/dataset/engine/ir/datasetops/rename_node.h"
+#include "minddata/dataset/engine/ir/datasetops/repeat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+#include "minddata/dataset/engine/ir/datasetops/take_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/album_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/coco_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/random_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h"
+
 using namespace mindspore::dataset::api;
+using mindspore::dataset::GlobalContext;
 using mindspore::dataset::ShuffleMode;
 using mindspore::dataset::Tensor;
-using mindspore::dataset::GlobalContext;
 
 class MindDataTestPipeline : public UT::DatasetOpTesting {
  protected:
@@ -98,12 +128,8 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetMultiFiles) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("col1"), row.end());
   std::vector<std::vector<std::string>> expected_result = {
-    {"17", "18", "19", "20"},
-    {"1", "2", "3", "4"},
-    {"5", "6", "7", "8"},
-    {"13", "14", "15", "16"},
-    {"21", "22", "23", "24"},
-    {"9", "10", "11", "12"},
+    {"17", "18", "19", "20"}, {"1", "2", "3", "4"},     {"5", "6", "7", "8"},
+    {"13", "14", "15", "16"}, {"21", "22", "23", "24"}, {"9", "10", "11", "12"},
   };
 
   uint64_t i = 0;
@@ -148,10 +174,7 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetNumSamples) {
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("col1"), row.end());
-  std::vector<std::vector<std::string>> expected_result = {
-    {"1", "2", "3", "4"},
-    {"5", "6", "7", "8"}
-  };
+  std::vector<std::vector<std::string>> expected_result = {{"1", "2", "3", "4"}, {"5", "6", "7", "8"}};
 
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -191,10 +214,7 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetDistribution) {
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("col1"), row.end());
-  std::vector<std::vector<std::string>> expected_result = {
-    {"1", "2", "3", "4"},
-    {"5", "6", "7", "8"}
-  };
+  std::vector<std::vector<std::string>> expected_result = {{"1", "2", "3", "4"}, {"5", "6", "7", "8"}};
 
   uint64_t i = 0;
   while (row.size() != 0) {
@@ -386,12 +406,8 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetShuffleFilesA) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("col1"), row.end());
   std::vector<std::vector<std::string>> expected_result = {
-    {"13", "14", "15", "16"},
-    {"1", "2", "3", "4"},
-    {"17", "18", "19", "20"},
-    {"5", "6", "7", "8"},
-    {"21", "22", "23", "24"},
-    {"9", "10", "11", "12"},
+    {"13", "14", "15", "16"}, {"1", "2", "3", "4"},     {"17", "18", "19", "20"},
+    {"5", "6", "7", "8"},     {"21", "22", "23", "24"}, {"9", "10", "11", "12"},
   };
 
   uint64_t i = 0;
@@ -445,12 +461,8 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetShuffleFilesB) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("col1"), row.end());
   std::vector<std::vector<std::string>> expected_result = {
-    {"13", "14", "15", "16"},
-    {"1", "2", "3", "4"},
-    {"17", "18", "19", "20"},
-    {"5", "6", "7", "8"},
-    {"21", "22", "23", "24"},
-    {"9", "10", "11", "12"},
+    {"13", "14", "15", "16"}, {"1", "2", "3", "4"},     {"17", "18", "19", "20"},
+    {"5", "6", "7", "8"},     {"21", "22", "23", "24"}, {"9", "10", "11", "12"},
   };
 
   uint64_t i = 0;
@@ -505,10 +517,7 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetShuffleGlobal) {
   iter->GetNextRow(&row);
   EXPECT_NE(row.find("col1"), row.end());
   std::vector<std::vector<std::string>> expected_result = {
-    {"5", "6", "7", "8"},
-    {"9", "10", "11", "12"},
-    {"1", "2", "3", "4"}
-  };
+    {"5", "6", "7", "8"}, {"9", "10", "11", "12"}, {"1", "2", "3", "4"}};
 
   uint64_t i = 0;
   while (row.size() != 0) {
diff --git a/tests/ut/cpp/dataset/c_api_dataset_iterator_test.cc b/tests/ut/cpp/dataset/c_api_dataset_iterator_test.cc
index 1ee03f6eb4..06c14a3013 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_iterator_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_iterator_test.cc
@@ -1,3 +1,4 @@
+
 /**
  * Copyright 2020 Huawei Technologies Co., Ltd
  *
@@ -16,12 +17,21 @@
 #include "common/common.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
 #include "minddata/dataset/engine/ir/datasetops/batch_node.h"
 #include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
 #include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
"minddata/dataset/engine/ir/datasetops/project_node.h" #include "minddata/dataset/engine/ir/datasetops/rename_node.h" #include "minddata/dataset/engine/ir/datasetops/shuffle_node.h" +#include "minddata/dataset/engine/ir/datasetops/skip_node.h" +#include "minddata/dataset/engine/ir/datasetops/zip_node.h" + +// IR leaf nodes +#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h" using namespace mindspore::dataset::api; using mindspore::dataset::Tensor; diff --git a/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc b/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc index 029a6055f6..d94a233aca 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc @@ -16,6 +16,19 @@ #include "common/common.h" #include "minddata/dataset/include/datasets.h" +// IR non-leaf nodes +#include "minddata/dataset/engine/ir/datasetops/batch_node.h" +#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h" +#include "minddata/dataset/engine/ir/datasetops/concat_node.h" +#include "minddata/dataset/engine/ir/datasetops/map_node.h" +#include "minddata/dataset/engine/ir/datasetops/project_node.h" +#include "minddata/dataset/engine/ir/datasetops/rename_node.h" +#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h" +#include "minddata/dataset/engine/ir/datasetops/zip_node.h" + +// IR leaf nodes +#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h" + using namespace mindspore::dataset::api; using mindspore::dataset::Tensor; diff --git a/tests/ut/cpp/dataset/c_api_dataset_mindrecord.cc b/tests/ut/cpp/dataset/c_api_dataset_mindrecord.cc index 47923b6bc1..d8c110e066 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_mindrecord.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_mindrecord.cc @@ -16,6 +16,20 @@ #include "common/common.h" #include "minddata/dataset/include/datasets.h" +// IR non-leaf nodes +#include "minddata/dataset/engine/ir/datasetops/batch_node.h" +#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h" +#include "minddata/dataset/engine/ir/datasetops/concat_node.h" +#include "minddata/dataset/engine/ir/datasetops/map_node.h" +#include "minddata/dataset/engine/ir/datasetops/project_node.h" +#include "minddata/dataset/engine/ir/datasetops/rename_node.h" +#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h" +#include "minddata/dataset/engine/ir/datasetops/skip_node.h" +#include "minddata/dataset/engine/ir/datasetops/zip_node.h" + +// IR leaf nodes +#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h" + using namespace mindspore::dataset::api; using mindspore::dataset::Tensor; @@ -57,7 +71,6 @@ TEST_F(MindDataTestPipeline, TestMindDataSuccess1) { iter->Stop(); } - TEST_F(MindDataTestPipeline, TestMindDataSuccess2) { MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataSuccess2 with a vector of single mindrecord file."; diff --git a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc index a583a04258..5bb855a247 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc @@ -18,16 +18,22 @@ #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/vision.h" +// IR non-leaf nodes #include "minddata/dataset/engine/ir/datasetops/batch_node.h" #include 
"minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h" -#include "minddata/dataset/engine/ir/datasetops/build_vocab_node.h" #include "minddata/dataset/engine/ir/datasetops/concat_node.h" +#include "minddata/dataset/engine/ir/datasetops/map_node.h" #include "minddata/dataset/engine/ir/datasetops/project_node.h" #include "minddata/dataset/engine/ir/datasetops/rename_node.h" -#include "minddata/dataset/engine/ir/datasetops/repeat_node.h" #include "minddata/dataset/engine/ir/datasetops/shuffle_node.h" +#include "minddata/dataset/engine/ir/datasetops/skip_node.h" +#include "minddata/dataset/engine/ir/datasetops/zip_node.h" + +// IR leaf nodes + +#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h" #include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h" -#include "minddata/dataset/engine/ir/datasetops/take_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h" using namespace mindspore::dataset::api; using mindspore::dataset::Tensor; diff --git a/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc b/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc index 9a8444f0af..2267012123 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc @@ -20,11 +20,41 @@ #include "mindspore/core/ir/dtype/type_id.h" +// IR non-leaf nodes +#include "minddata/dataset/engine/ir/datasetops/batch_node.h" +#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h" +#include "minddata/dataset/engine/ir/datasetops/build_vocab_node.h" +#include "minddata/dataset/engine/ir/datasetops/concat_node.h" +#include "minddata/dataset/engine/ir/datasetops/map_node.h" +#include "minddata/dataset/engine/ir/datasetops/project_node.h" +#include "minddata/dataset/engine/ir/datasetops/rename_node.h" +#include "minddata/dataset/engine/ir/datasetops/repeat_node.h" +#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h" +#include "minddata/dataset/engine/ir/datasetops/skip_node.h" +#include "minddata/dataset/engine/ir/datasetops/take_node.h" +#include "minddata/dataset/engine/ir/datasetops/zip_node.h" + +// IR leaf nodes +#include "minddata/dataset/engine/ir/datasetops/source/album_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/coco_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/random_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h" + using namespace mindspore::dataset; using namespace mindspore::dataset::api; +using mindspore::dataset::DataType; using mindspore::dataset::Tensor; using mindspore::dataset::TensorShape; -using mindspore::dataset::DataType; class MindDataTestPipeline : public UT::DatasetOpTesting { protected: diff --git a/tests/ut/cpp/dataset/c_api_dataset_textfile_test.cc 
index ee08b4e047..1ee83dc799 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_textfile_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_textfile_test.cc
@@ -14,10 +14,23 @@
  * limitations under the License.
  */
 #include "common/common.h"
-#include "minddata/dataset/core/config_manager.h"
 #include "minddata/dataset/core/global_context.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
+#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
+#include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+#include "minddata/dataset/engine/ir/datasetops/project_node.h"
+#include "minddata/dataset/engine/ir/datasetops/rename_node.h"
+#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
+
 using namespace mindspore::dataset;
 using namespace mindspore::dataset::api;
 using mindspore::dataset::ShuffleMode;
diff --git a/tests/ut/cpp/dataset/c_api_dataset_tfrecord_test.cc b/tests/ut/cpp/dataset/c_api_dataset_tfrecord_test.cc
index 99e934e343..27c7785a49 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_tfrecord_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_tfrecord_test.cc
@@ -18,12 +18,19 @@
 #include "minddata/dataset/include/vision.h"
 #include "minddata/dataset/core/global_context.h"
+// IR non-leaf nodes
 #include "minddata/dataset/engine/ir/datasetops/batch_node.h"
 #include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
 #include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
 #include "minddata/dataset/engine/ir/datasetops/project_node.h"
 #include "minddata/dataset/engine/ir/datasetops/rename_node.h"
 #include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h"
 
 using namespace mindspore::dataset;
 using namespace mindspore::dataset::api;
diff --git a/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc b/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc
index ab81d3667d..f09363f306 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc
@@ -16,10 +16,24 @@
 #include "common/common.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
+#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
+#include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+#include "minddata/dataset/engine/ir/datasetops/project_node.h"
+#include "minddata/dataset/engine/ir/datasetops/rename_node.h"
+#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h"
+
 using namespace mindspore::dataset::api;
+using mindspore::dataset::DataType;
 using mindspore::dataset::Tensor;
 using mindspore::dataset::TensorShape;
-using mindspore::dataset::DataType;
 
 class MindDataTestPipeline : public UT::DatasetOpTesting {
  protected:
diff --git a/tests/ut/cpp/dataset/c_api_datasets_test.cc b/tests/ut/cpp/dataset/c_api_datasets_test.cc
index af5826d30b..5e6e588852 100644
--- a/tests/ut/cpp/dataset/c_api_datasets_test.cc
+++ b/tests/ut/cpp/dataset/c_api_datasets_test.cc
@@ -16,13 +16,22 @@
 #include "common/common.h"
 #include "minddata/dataset/include/datasets.h"
+// IR non-leaf nodes
 #include "minddata/dataset/engine/ir/datasetops/batch_node.h"
 #include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
 #include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
 #include "minddata/dataset/engine/ir/datasetops/project_node.h"
 #include "minddata/dataset/engine/ir/datasetops/rename_node.h"
 #include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+// IR leaf nodes
+
+#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h"
 #include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h"
 
 using namespace mindspore::dataset::api;
 using mindspore::dataset::Tensor;
diff --git a/tests/ut/cpp/dataset/c_api_text_vocab_test.cc b/tests/ut/cpp/dataset/c_api_text_vocab_test.cc
index 872a56d309..2a33af0262 100644
--- a/tests/ut/cpp/dataset/c_api_text_vocab_test.cc
+++ b/tests/ut/cpp/dataset/c_api_text_vocab_test.cc
@@ -25,6 +25,36 @@
 #include "minddata/dataset/include/transforms.h"
 #include "minddata/dataset/include/text.h"
+// IR non-leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
+#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
+#include "minddata/dataset/engine/ir/datasetops/build_vocab_node.h"
+#include "minddata/dataset/engine/ir/datasetops/concat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+#include "minddata/dataset/engine/ir/datasetops/project_node.h"
+#include "minddata/dataset/engine/ir/datasetops/rename_node.h"
+#include "minddata/dataset/engine/ir/datasetops/repeat_node.h"
+#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h"
+#include "minddata/dataset/engine/ir/datasetops/skip_node.h"
+#include "minddata/dataset/engine/ir/datasetops/take_node.h"
+#include "minddata/dataset/engine/ir/datasetops/zip_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/album_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/random_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h"
"minddata/dataset/engine/ir/datasetops/source/voc_node.h" + using namespace mindspore::dataset::api; using mindspore::dataset::DataType; using mindspore::dataset::ShuffleMode; @@ -304,8 +334,8 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail4) { // Create vocab from dataset // Expected failure: special tokens are already in the dataset - std::shared_ptr vocab = ds->BuildVocab({"text"}, {0, std::numeric_limits::max()}, - std::numeric_limits::max(), {"world"}); + std::shared_ptr vocab = + ds->BuildVocab({"text"}, {0, std::numeric_limits::max()}, std::numeric_limits::max(), {"world"}); EXPECT_EQ(vocab, nullptr); } diff --git a/tests/ut/cpp/dataset/c_api_transforms_test.cc b/tests/ut/cpp/dataset/c_api_transforms_test.cc index 9193b37b4c..a586a4b57e 100644 --- a/tests/ut/cpp/dataset/c_api_transforms_test.cc +++ b/tests/ut/cpp/dataset/c_api_transforms_test.cc @@ -18,12 +18,35 @@ #include "minddata/dataset/include/transforms.h" #include "minddata/dataset/include/vision.h" +// IR non-leaf nodes #include "minddata/dataset/engine/ir/datasetops/batch_node.h" #include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h" +#include "minddata/dataset/engine/ir/datasetops/build_vocab_node.h" #include "minddata/dataset/engine/ir/datasetops/concat_node.h" +#include "minddata/dataset/engine/ir/datasetops/map_node.h" #include "minddata/dataset/engine/ir/datasetops/project_node.h" #include "minddata/dataset/engine/ir/datasetops/rename_node.h" +#include "minddata/dataset/engine/ir/datasetops/repeat_node.h" #include "minddata/dataset/engine/ir/datasetops/shuffle_node.h" +#include "minddata/dataset/engine/ir/datasetops/skip_node.h" +#include "minddata/dataset/engine/ir/datasetops/take_node.h" +#include "minddata/dataset/engine/ir/datasetops/zip_node.h" + +// IR leaf nodes +#include "minddata/dataset/engine/ir/datasetops/source/album_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/random_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h" using namespace mindspore::dataset::api; using mindspore::dataset::BorderType; diff --git a/tests/ut/cpp/dataset/c_api_vision_test.cc b/tests/ut/cpp/dataset/c_api_vision_test.cc index 50cb5b8a5f..be3e8ddb1f 100644 --- a/tests/ut/cpp/dataset/c_api_vision_test.cc +++ b/tests/ut/cpp/dataset/c_api_vision_test.cc @@ -18,13 +18,21 @@ #include "minddata/dataset/include/transforms.h" #include "minddata/dataset/include/vision.h" +// IR non-leaf nodes #include "minddata/dataset/engine/ir/datasetops/batch_node.h" #include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h" #include "minddata/dataset/engine/ir/datasetops/concat_node.h" +#include "minddata/dataset/engine/ir/datasetops/map_node.h" #include 
"minddata/dataset/engine/ir/datasetops/project_node.h" #include "minddata/dataset/engine/ir/datasetops/rename_node.h" #include "minddata/dataset/engine/ir/datasetops/shuffle_node.h" +#include "minddata/dataset/engine/ir/datasetops/skip_node.h" +#include "minddata/dataset/engine/ir/datasetops/zip_node.h" + +// IR leaf nodes +#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h" #include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h" using namespace mindspore::dataset::api; using mindspore::dataset::BorderType; diff --git a/tests/ut/cpp/dataset/tree_adapter_test.cc b/tests/ut/cpp/dataset/tree_adapter_test.cc index 5b112089bb..f29dec7654 100644 --- a/tests/ut/cpp/dataset/tree_adapter_test.cc +++ b/tests/ut/cpp/dataset/tree_adapter_test.cc @@ -20,8 +20,22 @@ #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/transforms.h" -#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h" +// IR non-leaf nodes #include "minddata/dataset/engine/ir/datasetops/batch_node.h" +#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h" +#include "minddata/dataset/engine/ir/datasetops/concat_node.h" +#include "minddata/dataset/engine/ir/datasetops/map_node.h" +#include "minddata/dataset/engine/ir/datasetops/project_node.h" +#include "minddata/dataset/engine/ir/datasetops/rename_node.h" +#include "minddata/dataset/engine/ir/datasetops/shuffle_node.h" +#include "minddata/dataset/engine/ir/datasetops/skip_node.h" +#include "minddata/dataset/engine/ir/datasetops/zip_node.h" + +// IR leaf nodes + +#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h" + using namespace mindspore::dataset; using mindspore::dataset::Tensor;