From 419478b410be15c2572f96fe93f4adadbd787b34 Mon Sep 17 00:00:00 2001 From: YangLuo Date: Wed, 26 Aug 2020 20:08:53 +0800 Subject: [PATCH] Change default value of sampler to separate behaviour of default sampler & null sampler, add check for duplicate column name --- .../ccsrc/minddata/dataset/api/datasets.cc | 183 ++++++++++-------- .../ccsrc/minddata/dataset/include/datasets.h | 72 +++---- .../cpp/dataset/c_api_dataset_album_test.cc | 24 ++- .../cpp/dataset/c_api_dataset_cifar_test.cc | 30 +++ .../ut/cpp/dataset/c_api_dataset_coco_test.cc | 11 ++ .../ut/cpp/dataset/c_api_dataset_csv_test.cc | 11 ++ .../dataset/c_api_dataset_manifest_test.cc | 13 +- .../ut/cpp/dataset/c_api_dataset_ops_test.cc | 46 +++++ .../dataset/c_api_dataset_randomdata_test.cc | 26 ++- .../ut/cpp/dataset/c_api_dataset_voc_test.cc | 10 + tests/ut/cpp/dataset/c_api_datasets_test.cc | 57 +++++- 11 files changed, 356 insertions(+), 127 deletions(-) diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc index 804a3ed6e9..ee14dc3cd6 100644 --- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc +++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc @@ -201,8 +201,8 @@ std::shared_ptr ImageFolder(const std::string &dataset_dir, } // Function to create a ManifestDataset. -std::shared_ptr Manifest(std::string dataset_file, std::string usage, - std::shared_ptr sampler, +std::shared_ptr Manifest(const std::string &dataset_file, const std::string &usage, + const std::shared_ptr &sampler, const std::map &class_indexing, bool decode) { auto ds = std::make_shared(dataset_file, usage, sampler, class_indexing, decode); @@ -590,13 +590,6 @@ bool SchemaObj::from_json(nlohmann::json json_obj) { // OTHER FUNCTIONS -// Helper function to create default RandomSampler. -std::shared_ptr CreateDefaultSampler() { - const int32_t num_samples = 0; // 0 means to sample all ids. 
- bool replacement = false; - return std::make_shared(replacement, num_samples); -} - // Helper function to compute a default shuffle size Status ComputeShuffleSize(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows, int64_t *shuffle_size) { @@ -692,6 +685,36 @@ bool ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_sha return true; } +// Helper function to validate dataset sampler parameter +bool ValidateDatasetSampler(const std::string &dataset_name, const std::shared_ptr &sampler) { + if (sampler == nullptr) { + MS_LOG(ERROR) << dataset_name << ": Sampler is not constructed correctly, sampler: nullptr"; + return false; + } + return true; +} + +// Helper function to validate dataset input/output column parameter +bool ValidateDatasetColumnParam(const std::string &dataset_name, const std::string &column_param, + const std::vector &columns) { + if (columns.empty()) { + MS_LOG(ERROR) << dataset_name << ":" << column_param << " should not be empty"; + return false; + } + for (uint32_t i = 0; i < columns.size(); ++i) { + if (columns[i].empty()) { + MS_LOG(ERROR) << dataset_name << ":" << column_param << "[" << i << "] should not be empty"; + return false; + } + } + std::set columns_set(columns.begin(), columns.end()); + if (columns_set.size() != columns.size()) { + MS_LOG(ERROR) << dataset_name << ":" << column_param << ": Every column name should not be same with others"; + return false; + } + return true; +} + /* ####################################### Derived Dataset classes ################################# */ // DERIVED DATASET CLASSES LEAF-NODE DATASETS @@ -716,6 +739,16 @@ bool AlbumDataset::ValidateParams() { return false; } + if (!ValidateDatasetSampler("AlbumDataset", sampler_)) { + return false; + } + + if (!column_names_.empty()) { + if (!ValidateDatasetColumnParam("AlbumDataset", "column_names", column_names_)) { + return false; + } + } + return true; } @@ -724,11 +757,6 @@ std::vector> AlbumDataset::Build() 
{ // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler, i.e., RandomSampler. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - auto schema = std::make_unique(); RETURN_EMPTY_IF_ERROR(schema->LoadSchemaFile(schema_path_, column_names_)); @@ -754,6 +782,9 @@ bool CelebADataset::ValidateParams() { if (!ValidateDatasetDirParam("CelebADataset", dataset_dir_)) { return false; } + if (!ValidateDatasetSampler("CelebADataset", sampler_)) { + return false; + } std::set dataset_type_list = {"all", "train", "valid", "test"}; auto iter = dataset_type_list.find(dataset_type_); if (iter == dataset_type_list.end()) { @@ -768,11 +799,6 @@ std::vector> CelebADataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. 
- if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - std::unique_ptr schema = std::make_unique(); RETURN_EMPTY_IF_ERROR( schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); @@ -789,18 +815,15 @@ std::vector> CelebADataset::Build() { Cifar10Dataset::Cifar10Dataset(const std::string &dataset_dir, std::shared_ptr sampler) : dataset_dir_(dataset_dir), sampler_(sampler) {} -bool Cifar10Dataset::ValidateParams() { return ValidateDatasetDirParam("Cifar10Dataset", dataset_dir_); } +bool Cifar10Dataset::ValidateParams() { + return ValidateDatasetDirParam("Cifar10Dataset", dataset_dir_) && ValidateDatasetSampler("Cifar10Dataset", sampler_); +} // Function to build CifarOp for Cifar10 std::vector> Cifar10Dataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. 
auto schema = std::make_unique(); RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); @@ -818,18 +841,16 @@ std::vector> Cifar10Dataset::Build() { Cifar100Dataset::Cifar100Dataset(const std::string &dataset_dir, std::shared_ptr sampler) : dataset_dir_(dataset_dir), sampler_(sampler) {} -bool Cifar100Dataset::ValidateParams() { return ValidateDatasetDirParam("Cifar100Dataset", dataset_dir_); } +bool Cifar100Dataset::ValidateParams() { + return ValidateDatasetDirParam("Cifar100Dataset", dataset_dir_) && + ValidateDatasetSampler("Cifar100Dataset", sampler_); +} // Function to build CifarOp for Cifar100 std::vector> Cifar100Dataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. auto schema = std::make_unique(); RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); @@ -1045,6 +1066,9 @@ bool CocoDataset::ValidateParams() { if (!ValidateDatasetDirParam("CocoDataset", dataset_dir_)) { return false; } + if (!ValidateDatasetSampler("CocoDataset", sampler_)) { + return false; + } Path annotation_file(annotation_file_); if (!annotation_file.Exists()) { MS_LOG(ERROR) << "annotation_file is invalid or not exist"; @@ -1064,11 +1088,6 @@ std::vector> CocoDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. 
- if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - CocoOp::TaskType task_type; if (task_ == "Detection") { task_type = CocoOp::TaskType::Detection; @@ -1158,6 +1177,12 @@ bool CSVDataset::ValidateParams() { return false; } + if (!column_names_.empty()) { + if (!ValidateDatasetColumnParam("CSVDataset", "column_names", column_names_)) { + return false; + } + } + return true; } @@ -1218,17 +1243,15 @@ ImageFolderDataset::ImageFolderDataset(std::string dataset_dir, bool decode, std class_indexing_(class_indexing), exts_(extensions) {} -bool ImageFolderDataset::ValidateParams() { return ValidateDatasetDirParam("ImageFolderDataset", dataset_dir_); } +bool ImageFolderDataset::ValidateParams() { + return ValidateDatasetDirParam("ImageFolderDataset", dataset_dir_) && + ValidateDatasetSampler("ImageFolderDataset", sampler_); +} std::vector> ImageFolderDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler, i.e., RandomSampler. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. // This arg is exist in ImageFolderOp, but not externalized (in Python API). 
std::unique_ptr schema = std::make_unique(); @@ -1243,7 +1266,8 @@ std::vector> ImageFolderDataset::Build() { return node_ops; } -ManifestDataset::ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr sampler, +ManifestDataset::ManifestDataset(const std::string &dataset_file, const std::string &usage, + const std::shared_ptr &sampler, const std::map &class_indexing, bool decode) : dataset_file_(dataset_file), usage_(usage), decode_(decode), class_index_(class_indexing), sampler_(sampler) {} @@ -1254,6 +1278,10 @@ bool ManifestDataset::ValidateParams() { return false; } + if (!ValidateDatasetSampler("ManifestDataset", sampler_)) { + return false; + } + std::vector usage_list = {"train", "eval", "inference"}; if (find(usage_list.begin(), usage_list.end(), usage_) == usage_list.end()) { MS_LOG(ERROR) << "usage should be train, eval or inference."; @@ -1267,11 +1295,6 @@ std::vector> ManifestDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. 
auto schema = std::make_unique(); RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); @@ -1291,17 +1314,14 @@ std::vector> ManifestDataset::Build() { MnistDataset::MnistDataset(std::string dataset_dir, std::shared_ptr sampler) : dataset_dir_(dataset_dir), sampler_(sampler) {} -bool MnistDataset::ValidateParams() { return ValidateDatasetDirParam("MnistDataset", dataset_dir_); } +bool MnistDataset::ValidateParams() { + return ValidateDatasetDirParam("MnistDataset", dataset_dir_) && ValidateDatasetSampler("MnistDataset", sampler_); +} std::vector> MnistDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler, i.e., RandomSampler. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. auto schema = std::make_unique(); RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); @@ -1320,6 +1340,14 @@ bool RandomDataset::ValidateParams() { MS_LOG(ERROR) << "RandomDataset: total_rows must be greater than 0, now get " << total_rows_; return false; } + if (!ValidateDatasetSampler("RandomDataset", sampler_)) { + return false; + } + if (!columns_list_.empty()) { + if (!ValidateDatasetColumnParam("RandomDataset", "columns_list", columns_list_)) { + return false; + } + } return true; } @@ -1342,11 +1370,6 @@ std::vector> RandomDataset::Build() { total_rows_ = schema_obj->get_num_rows(); } - // If user does not specify Sampler, create a default sampler based on the shuffle variable. 
- if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - std::string schema_json_string, schema_file_path; if (schema_ != nullptr) { schema_->set_dataset_type("Random"); @@ -1459,6 +1482,9 @@ bool VOCDataset::ValidateParams() { MS_LOG(ERROR) << "Invalid dataset path or no dataset path is specified."; return false; } + if (!ValidateDatasetSampler("VOCDataset", sampler_)) { + return false; + } if (task_ == "Segmentation") { if (!class_index_.empty()) { MS_LOG(ERROR) << "class_indexing is invalid in Segmentation task."; @@ -1487,11 +1513,6 @@ std::vector> VOCDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - auto schema = std::make_unique(); VOCOp::TaskType task_type_; @@ -1657,7 +1678,21 @@ bool MapDataset::ValidateParams() { MS_LOG(ERROR) << "Map: No operation is specified."; return false; } - + if (!input_columns_.empty()) { + if (!ValidateDatasetColumnParam("MapDataset", "input_columns", input_columns_)) { + return false; + } + } + if (!output_columns_.empty()) { + if (!ValidateDatasetColumnParam("MapDataset", "output_columns", output_columns_)) { + return false; + } + } + if (!project_columns_.empty()) { + if (!ValidateDatasetColumnParam("MapDataset", "project_columns", project_columns_)) { + return false; + } + } return true; } @@ -1686,23 +1721,13 @@ RenameDataset::RenameDataset(const std::vector &input_columns, : input_columns_(input_columns), output_columns_(output_columns) {} bool RenameDataset::ValidateParams() { - if (input_columns_.empty() || output_columns_.empty()) { - MS_LOG(ERROR) << "input and output columns must be specified"; - return false; - } if (input_columns_.size() != output_columns_.size()) { - MS_LOG(ERROR) << "input and output columns must be the same size"; + MS_LOG(ERROR) << 
"RenameDataset: input and output columns must be the same size"; return false; } - for (uint32_t i = 0; i < input_columns_.size(); ++i) { - if (input_columns_[i].empty()) { - MS_LOG(ERROR) << "input_columns: column name should not be empty."; - return false; - } - if (output_columns_[i].empty()) { - MS_LOG(ERROR) << "output_columns: column name should not be empty."; - return false; - } + if (!ValidateDatasetColumnParam("RenameDataset", "input_columns", input_columns_) || + !ValidateDatasetColumnParam("RenameDataset", "output_columns", output_columns_)) { + return false; } return true; } diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h index 8564827b4d..fb9ea146d8 100644 --- a/mindspore/ccsrc/minddata/dataset/include/datasets.h +++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h @@ -87,44 +87,44 @@ std::shared_ptr Schema(const std::string &schema_file = ""); /// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns. /// (default = {}) /// \param[in] decode the option to decode the images in dataset (default = false) -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset (default = nullptr) +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, const std::vector &column_names = {}, bool decode = false, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a CelebADataset /// \notes The generated dataset has two columns ['image', 'attr']. // The type of the image tensor is uint8. 
The attr tensor is uint32 and one hot type. /// \param[in] dataset_dir Path to the root directory that contains the dataset. /// \param[in] dataset_type One of 'all', 'train', 'valid' or 'test'. -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \param[in] decode Decode the images after reading (default=false). /// \param[in] extensions Set of file extensions to be included in the dataset (default={}). /// \return Shared pointer to the current Dataset std::shared_ptr CelebA(const std::string &dataset_dir, const std::string &dataset_type = "all", - const std::shared_ptr &sampler = nullptr, bool decode = false, + const std::shared_ptr &sampler = RandomSampler(), bool decode = false, const std::set &extensions = {}); /// \brief Function to create a Cifar10 Dataset /// \notes The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset std::shared_ptr Cifar10(const std::string &dataset_dir, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a Cifar100 Dataset /// \notes The generated dataset has three columns ['image', 'coarse_label', 'fine_label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset std::shared_ptr Cifar100(const std::string &dataset_dir, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a CLUEDataset /// \notes The generated dataset has a variable number of columns depending on the task and usage @@ -161,12 +161,12 @@ std::shared_ptr CLUE(const std::vector &dataset_files, /// \param[in] annotation_file Path to the annotation json /// \param[in] task Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint' /// \param[in] decode Decode the images after reading -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task = "Detection", const bool &decode = false, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a CSVDataset /// \notes The generated dataset has a variable number of columns @@ -200,13 +200,13 @@ std::shared_ptr CSV(const std::vector &dataset_files, c /// The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] decode A flag to decode in ImageFolder -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \param[in] extensions File extensions to be read /// \param[in] class_indexing a class name to label map /// \return Shared pointer to the current ImageFolderDataset std::shared_ptr ImageFolder(const std::string &dataset_dir, bool decode = false, - const std::shared_ptr &sampler = nullptr, + const std::shared_ptr &sampler = RandomSampler(), const std::set &extensions = {}, const std::map &class_indexing = {}); @@ -214,25 +214,25 @@ std::shared_ptr ImageFolder(const std::string &dataset_dir, /// \notes The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_file The dataset file to be read /// \param[in] usage Need "train", "eval" or "inference" data (default="train") -/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder /// names will be sorted alphabetically and each class will be given a unique index starting from 0). /// \param[in] decode Decode the images after reading (default=false). /// \return Shared pointer to the current ManifestDataset -std::shared_ptr Manifest(std::string dataset_file, std::string usage = "train", - std::shared_ptr sampler = nullptr, +std::shared_ptr Manifest(const std::string &dataset_file, const std::string &usage = "train", + const std::shared_ptr &sampler = RandomSampler(), const std::map &class_indexing = {}, bool decode = false); /// \brief Function to create a MnistDataset /// \notes The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current MnistDataset std::shared_ptr Mnist(const std::string &dataset_dir, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a ConcatDataset /// \notes Reload "+" operator to concat two datasets @@ -246,14 +246,14 @@ std::shared_ptr operator+(const std::shared_ptr &dataset /// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random) /// \param[in] schema SchemaObj to set column type, data type and data shape /// \param[in] columns_list List of columns to be read (default={}, read all columns) -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset template > std::shared_ptr RandomData(const int32_t &total_rows = 0, T schema = nullptr, const std::vector &columns_list = {}, - std::shared_ptr sampler = nullptr) { - auto ds = std::make_shared(total_rows, schema, std::move(columns_list), std::move(sampler)); + const std::shared_ptr &sampler = RandomSampler()) { + auto ds = std::make_shared(total_rows, schema, columns_list, std::move(sampler)); return ds->ValidateParams() ? ds : nullptr; } @@ -286,13 +286,13 @@ std::shared_ptr TextFile(const std::vector &datase /// \param[in] mode Set the data list txt file to be readed /// \param[in] class_indexing A str-to-int mapping from label name to index /// \param[in] decode Decode the images after reading -/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset std::shared_ptr VOC(const std::string &dataset_dir, const std::string &task = "Segmentation", const std::string &mode = "train", const std::map &class_indexing = {}, bool decode = false, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a ZipDataset /// \notes Applies zip to the dataset @@ -756,7 +756,7 @@ class ImageFolderDataset : public Dataset { class ManifestDataset : public Dataset { public: /// \brief Constructor - ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr sampler, + ManifestDataset(const std::string &dataset_file, const std::string &usage, const std::shared_ptr &sampler, const std::map &class_indexing, bool decode); /// \brief Destructor @@ -808,7 +808,7 @@ class RandomDataset : public Dataset { /// \brief Constructor RandomDataset(const int32_t &total_rows, std::shared_ptr schema, - const std::vector &columns_list, std::shared_ptr sampler) + const std::vector &columns_list, const std::shared_ptr &sampler) : total_rows_(total_rows), schema_path_(""), schema_(std::move(schema)), @@ -816,8 +816,8 @@ class RandomDataset : public Dataset { sampler_(std::move(sampler)) {} /// \brief Constructor - RandomDataset(const int32_t &total_rows, std::string schema_path, std::vector columns_list, - std::shared_ptr sampler) + RandomDataset(const int32_t &total_rows, std::string schema_path, const std::vector &columns_list, + const std::shared_ptr &sampler) : total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), sampler_(std::move(sampler)) {} /// \brief Destructor diff --git 
a/tests/ut/cpp/dataset/c_api_dataset_album_test.cc b/tests/ut/cpp/dataset/c_api_dataset_album_test.cc index 820909d8e8..166484feee 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_album_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_album_test.cc @@ -93,7 +93,7 @@ TEST_F(MindDataTestPipeline, TestAlbumDecode) { TEST_F(MindDataTestPipeline, TestAlbumNumSamplers) { MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumNumSamplers."; - + std::string folder_path = datasets_root_path_ + "/testAlbum/images"; std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; std::vector column_names = {"image", "label", "id"}; @@ -134,3 +134,25 @@ TEST_F(MindDataTestPipeline, TestAlbumError) { EXPECT_EQ(ds, nullptr); } + +TEST_F(MindDataTestPipeline, TestAlbumWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumWithNullSampler."; + std::string folder_path = datasets_root_path_ + "/testAlbum/images"; + std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; + std::vector column_names = {"image", "label", "id"}; + // Create a Album Dataset + std::shared_ptr ds = Album(folder_path, schema_file, column_names, true, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestAlbumDuplicateColumnName) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumDuplicateColumnName."; + std::string folder_path = datasets_root_path_ + "/testAlbum/images"; + std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; + std::vector column_names = {"image", "image", "id"}; + // Create a Album Dataset + std::shared_ptr ds = Album(folder_path, schema_file, column_names, true); + // Expect failure: duplicate column names + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc b/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc index 2e4125516d..1c473f6c81 100644 --- 
a/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc b/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc index 2e4125516d..1c473f6c81 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc @@ -107,3 +107,33 @@ TEST_F(MindDataTestPipeline, TestCifar10DatasetFail1) { std::shared_ptr ds = Cifar10("", RandomSampler(false, 10)); EXPECT_EQ(ds, nullptr); } + +TEST_F(MindDataTestPipeline, TestCifar10DatasetWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar10DatasetWithNullSampler."; + + // Create a Cifar10 Dataset + std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; + std::shared_ptr ds = Cifar10(folder_path, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestCifar100DatasetWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetWithNullSampler."; + + // Create a Cifar100 Dataset + std::string folder_path = datasets_root_path_ + "/testCifar100Data/"; + std::shared_ptr ds = Cifar100(folder_path, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestCifar100DatasetWithWrongSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetWithWrongSampler."; + + // Create a Cifar100 Dataset + std::string folder_path = datasets_root_path_ + "/testCifar100Data/"; + std::shared_ptr ds = Cifar100(folder_path, RandomSampler(false, -10)); + // Expect failure: sampler is not constructed correctly + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc b/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc index dcc21e253b..ec1c784b95 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc @@ -290,3 +290,14 @@ TEST_F(MindDataTestPipeline, TestCocoStuff) { // Manually terminate the pipeline iter->Stop(); } + +TEST_F(MindDataTestPipeline, TestCocoWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCocoWithNullSampler."; + // Create a Coco Dataset + 
std::string folder_path = datasets_root_path_ + "/testCOCO/train"; + std::string annotation_file = datasets_root_path_ + "/testCOCO/annotations/train.json"; + + std::shared_ptr ds = Coco(folder_path, annotation_file, "Detection", false, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc b/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc index 36fb1d13af..52aee05be6 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc @@ -533,3 +533,14 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetShuffleGlobal) { GlobalContext::config_manager()->set_seed(original_seed); GlobalContext::config_manager()->set_num_parallel_workers(original_num_parallel_workers); } + +TEST_F(MindDataTestPipeline, TestCSVDatasetDuplicateColumnName) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCSVDatasetDuplicateColumnName."; + + // Create a CSVDataset, with single CSV file + std::string train_file = datasets_root_path_ + "/testCSV/1.csv"; + std::vector column_names = {"col1", "col1", "col3", "col4"}; + std::shared_ptr ds = CSV({train_file}, ',', {}, column_names, -1, ShuffleMode::kFalse); + // Expect failure: duplicate column names + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc b/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc index 5e4c91c765..5911279d02 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc @@ -59,7 +59,7 @@ TEST_F(MindDataTestPipeline, TestManifestDecode) { std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json"; // Create a Manifest Dataset - std::shared_ptr ds = Manifest(file_path, "train", nullptr, {}, true); + std::shared_ptr ds = Manifest(file_path, "train", RandomSampler(), {}, true); EXPECT_NE(ds, nullptr); // Create an iterator over the result of the above dataset @@ -130,7 +130,7 @@ 
TEST_F(MindDataTestPipeline, TestManifestClassIndex) { std::vector expected_label = {111, 222}; // Create a Manifest Dataset - std::shared_ptr ds = Manifest(file_path, "train", nullptr, map, true); + std::shared_ptr ds = Manifest(file_path, "train", RandomSampler(), map, true); EXPECT_NE(ds, nullptr); // Create an iterator over the result of the above dataset @@ -204,3 +204,12 @@ TEST_F(MindDataTestPipeline, TestManifestError) { std::shared_ptr ds1 = Manifest(file_path, "invalid_usage"); EXPECT_EQ(ds1, nullptr); } + +TEST_F(MindDataTestPipeline, TestManifestWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestWithNullSampler."; + std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json"; + // Create a Manifest Dataset + std::shared_ptr ds = Manifest(file_path, "train", nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc index 48b2f71669..818189e6e2 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc @@ -311,6 +311,34 @@ TEST_F(MindDataTestPipeline, TestProjectMap) { iter->Stop(); } +TEST_F(MindDataTestPipeline, TestMapDuplicateColumn) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMapDuplicateColumn."; + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, true, RandomSampler(false, 10)); + EXPECT_NE(ds, nullptr); + + // Create objects for the tensor ops + std::shared_ptr random_vertical_flip_op = vision::RandomVerticalFlip(0.5); + EXPECT_NE(random_vertical_flip_op, nullptr); + + // Create a Map operation on ds + auto ds1 = ds->Map({random_vertical_flip_op}, {"image", "image"}, {}, {}); + // Expect failure: duplicate input column name + EXPECT_EQ(ds1, nullptr); + + // Create a Map operation on ds + auto ds2 = 
ds->Map({random_vertical_flip_op}, {}, {"label", "label"}, {}); + // Expect failure: duplicate output column name + EXPECT_EQ(ds2, nullptr); + + // Create a Map operation on ds + auto ds3 = ds->Map({random_vertical_flip_op}, {}, {}, {"image", "image"}); + // Expect failure: duplicate project column name + EXPECT_EQ(ds3, nullptr); +} + TEST_F(MindDataTestPipeline, TestProjectMapAutoInjection) { MS_LOG(INFO) << "Doing MindDataTestPipeline.TestProjectMapAutoInjection"; @@ -395,6 +423,24 @@ TEST_F(MindDataTestPipeline, TestRenameFail2) { EXPECT_EQ(ds, nullptr); } +TEST_F(MindDataTestPipeline, TestRenameFail3) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameFail3."; + // We expect both Rename calls below to fail because of duplicate column names + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, true, RandomSampler(false, 10)); + EXPECT_NE(ds, nullptr); + + // Create a Rename operation on ds with duplicate input column names; expect failure + auto ds1 = ds->Rename({"image", "image"}, {"col1", "col2"}); + EXPECT_EQ(ds1, nullptr); + + // Create a Rename operation on ds with duplicate output column names; expect failure + auto ds2 = ds->Rename({"image", "label"}, {"col1", "col1"}); + EXPECT_EQ(ds2, nullptr); +} + TEST_F(MindDataTestPipeline, TestRenameSuccess) { MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameSuccess."; diff --git a/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc b/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc index 22e77a2ddc..0506a58134 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc @@ -265,4 +265,28 @@ TEST_F(MindDataTestPipeline, TestRandomDatasetBasic4) { // Manually terminate the pipeline iter->Stop(); GlobalContext::config_manager()->set_seed(curr_seed); -} \ No newline at end of file +} + +TEST_F(MindDataTestPipeline, TestRandomDatasetWithNullSampler) 
{ + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetWithNullSampler."; + + // Create a 
RandomDataset + std::shared_ptr schema = Schema(); + schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2}); + schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1}); + std::shared_ptr ds = RandomData(50, schema, {}, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestRandomDatasetDuplicateColumnName) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetDuplicateColumnName."; + + // Create a RandomDataset + std::shared_ptr schema = Schema(); + schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2}); + schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1}); + std::shared_ptr ds = RandomData(50, schema, {"image", "image"}); + // Expect failure: duplicate column names + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc b/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc index 17fa23198a..ab81d3667d 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc @@ -194,3 +194,13 @@ TEST_F(MindDataTestPipeline, TestVOCSegmentationError1) { // Expect nullptr for segmentation task with class_index EXPECT_EQ(ds, nullptr); } + +TEST_F(MindDataTestPipeline, TestVOCWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVOCWithNullSampler."; + + // Create a VOC Dataset + std::string folder_path = datasets_root_path_ + "/testVOC2012_2"; + std::shared_ptr ds = VOC(folder_path, "Segmentation", "train", {}, false, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_datasets_test.cc b/tests/ut/cpp/dataset/c_api_datasets_test.cc index 1ae562a618..2952222d72 100644 --- a/tests/ut/cpp/dataset/c_api_datasets_test.cc +++ b/tests/ut/cpp/dataset/c_api_datasets_test.cc @@ -118,24 +118,44 @@ TEST_F(MindDataTestPipeline, TestCelebAException) { EXPECT_EQ(ds1, nullptr); } 
-TEST_F(MindDataTestPipeline, TestImageFolderFail1) { - MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFail1."; +TEST_F(MindDataTestPipeline, TestCelebADatasetWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCelebADatasetWithNullSampler."; - // Create an ImageFolder Dataset - std::shared_ptr ds = ImageFolder("", true, nullptr); + // Create a CelebA Dataset + std::string folder_path = datasets_root_path_ + "/testCelebAData/"; + std::shared_ptr ds = CelebA(folder_path, "all", nullptr, false, {}); + // Expect failure: sampler can not be nullptr EXPECT_EQ(ds, nullptr); } -TEST_F(MindDataTestPipeline, TestMnistFail1) { - MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFail1."; +TEST_F(MindDataTestPipeline, TestMnistFailWithWrongDatasetDir) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithWrongDatasetDir."; // Create a Mnist Dataset std::shared_ptr ds = Mnist("", RandomSampler(false, 10)); EXPECT_EQ(ds, nullptr); } -TEST_F(MindDataTestPipeline, TestImageFolderFail2) { - MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFail2."; +TEST_F(MindDataTestPipeline, TestMnistFailWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithNullSampler."; + + // Create a Mnist Dataset + std::string folder_path = datasets_root_path_ + "/testMnistData/"; + std::shared_ptr ds = Mnist(folder_path, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestImageFolderWithWrongDatasetDir) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderWithWrongDatasetDir."; + + // Create an ImageFolder Dataset with an empty dataset_dir; a valid sampler isolates the directory check + std::shared_ptr ds = ImageFolder("", true, RandomSampler(false, 10)); + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestImageFolderFailWithWrongExtension) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithWrongExtension."; // Create an ImageFolder Dataset std::string folder_path = datasets_root_path_ + "/testPK/data/"; @@ -150,8 
+170,29 @@ TEST_F(MindDataTestPipeline, TestImageFolderFail2) { // Iterate the dataset and get each row std::unordered_map> row; iter->GetNextRow(&row); + // Expect no data: can not find files with specified extension EXPECT_EQ(row.size(), 0); // Manually terminate the pipeline iter->Stop(); } + +TEST_F(MindDataTestPipeline, TestImageFolderFailWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithNullSampler."; + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, true, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestImageFolderFailWithWrongSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithWrongSampler."; + + // Create an ImageFolder Dataset with a valid folder so that only the sampler is at fault + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, true, SequentialSampler(-2, 5)); + // Expect failure: sampler is not constructed correctly + EXPECT_EQ(ds, nullptr); +}