diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/datasetops/source/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/datasetops/source/bindings.cc index dcde4071a6..e3de11d813 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/datasetops/source/bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/datasetops/source/bindings.cc @@ -91,11 +91,17 @@ PYBIND_REGISTER(CocoOp, 1, ([](const py::module *m) { PYBIND_REGISTER(ImageFolderOp, 1, ([](const py::module *m) { (void)py::class_>(*m, "ImageFolderOp") - .def_static("get_num_rows_and_classes", [](const std::string &path) { - int64_t count = 0, num_classes = 0; - THROW_IF_ERROR( - ImageFolderOp::CountRowsAndClasses(path, std::set{}, &count, &num_classes)); - return py::make_tuple(count, num_classes); + .def_static("get_num_rows", + [](const std::string &path) { + int64_t count = 0; + THROW_IF_ERROR(ImageFolderOp::CountRowsAndClasses(path, {}, &count, nullptr, {})); + return count; + }) + .def_static("get_num_classes", [](const std::string &path, + const std::map class_index) { + int64_t num_classes = 0; + THROW_IF_ERROR(ImageFolderOp::CountRowsAndClasses(path, {}, nullptr, &num_classes, class_index)); + return num_classes; }); })); diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc index 478883ffd5..a3588c1146 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc @@ -15,7 +15,7 @@ */ #include "minddata/dataset/engine/datasetops/source/image_folder_op.h" #include -#include +#include #include "utils/ms_utils.h" #include "minddata/dataset/core/config_manager.h" #include "minddata/dataset/core/tensor_shape.h" @@ -280,7 +280,7 @@ Status 
ImageFolderOp::GetClassIds(std::map> *cls_i RETURN_STATUS_UNEXPECTED("No images found in dataset, please check if Op read images successfully or not."); } else { RETURN_STATUS_UNEXPECTED( - "Map for storaging image-index pair is nullptr or has been set in other place," + "Map containing image-index pair is nullptr or has been set in other place," "it must be empty before using GetClassIds."); } } @@ -294,14 +294,14 @@ Status ImageFolderOp::GetClassIds(std::map> *cls_i } // Worker Entry for pre-scanning all the folders and do the 1st level shuffle -// Worker pull a file name from mFoldernameQueue (which is a Queue), walks all the images under that foldername +// Worker pull a file name from folder_name_queue_ (which is a Queue), walks all the images under that foldername // After walking is complete, sort all the file names (relative path to all jpeg files under the same directory ) // (Sort is automatically conducted using a set which is implemented using a Red-Black Tree) // Add the sorted filenames in to a queue. The make a pair (foldername, queue*), // foldername is used for 2nd level sorting. // FYI: 1st level sorting: sort all images under the same directory. 
// FYI: 2nd level sorting: sort all folder names -// push this pair to mImagenameQueue (which is again a Queue) +// push this pair to image_name_queue_ (which is again a Queue) Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) { TaskManager::FindMe()->Post(); std::string folder_name; @@ -334,7 +334,7 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) { return Status::OK(); } -// This helper function recursively walks all foldernames, and send each foldername to mFoldernameQueue +// This helper function recursively walks all folder_paths, and send each foldername to folder_name_queue_ // if mRecursive == false, don't go into folder of folders Status ImageFolderOp::RecursiveWalkFolder(Path *dir) { std::shared_ptr dir_itr = Path::DirIterator::OpenDirectory(dir); @@ -355,7 +355,7 @@ Status ImageFolderOp::RecursiveWalkFolder(Path *dir) { } // A thread that calls RecursiveWalkFolder -Status ImageFolderOp::startAsyncWalk() { +Status ImageFolderOp::StartAsyncWalk() { TaskManager::FindMe()->Post(); Path dir(folder_path_); if (dir.Exists() == false || dir.IsDirectory() == false) { @@ -363,8 +363,8 @@ Status ImageFolderOp::startAsyncWalk() { } dirname_offset_ = folder_path_.length(); RETURN_IF_NOT_OK(RecursiveWalkFolder(&dir)); - // send out num_workers_ end signal to mFoldernameQueue, 1 for each worker. - // Upon receiving end Signal, worker quits and set another end Signal to mImagenameQueue. + // send out num_workers_ end signal to folder_name_queue_, 1 for each worker. + // Upon receiving end Signal, worker quits and set another end Signal to image_name_queue_. 
for (int32_t ind = 0; ind < num_workers_; ++ind) { RETURN_IF_NOT_OK(folder_name_queue_->EmplaceBack("")); // end signal } @@ -372,19 +372,17 @@ Status ImageFolderOp::startAsyncWalk() { } Status ImageFolderOp::LaunchThreadsAndInitOp() { - if (tree_ == nullptr) { - RETURN_STATUS_UNEXPECTED("Pipeline init failed, Execution tree not set."); - } + RETURN_UNEXPECTED_IF_NULL(tree_); // Registers QueueList and individual Queues for interrupt services RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks())); RETURN_IF_NOT_OK(folder_name_queue_->Register(tree_->AllTasks())); RETURN_IF_NOT_OK(image_name_queue_->Register(tree_->AllTasks())); RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks())); // The following code launch 3 threads group - // 1) A thread that walks all folders and push the folder names to a util:Queue mFoldernameQueue. - // 2) Workers that pull foldername from mFoldernameQueue, walk it and return the sorted images to mImagenameQueue + // 1) A thread that walks all folders and push the folder names to a util:Queue folder_name_queue_. 
+ // 2) Workers that pull foldername from folder_name_queue_, walk it and return the sorted images to image_name_queue_ // 3) Launch main workers that load DataBuffers by reading all images - RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::startAsyncWalk, this))); + RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::StartAsyncWalk, this))); RETURN_IF_NOT_OK( tree_->LaunchWorkers(num_workers_, std::bind(&ImageFolderOp::PrescanWorkerEntry, this, std::placeholders::_1))); RETURN_IF_NOT_OK( @@ -397,42 +395,53 @@ Status ImageFolderOp::LaunchThreadsAndInitOp() { } Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::set &exts, int64_t *num_rows, - int64_t *num_classes, int64_t dev_id, int64_t num_dev) { + int64_t *num_classes, std::map class_index) { Path dir(path); std::string err_msg = ""; int64_t row_cnt = 0; err_msg += (dir.Exists() == false || dir.IsDirectory() == false) - ? "Invalid parameter, image folde path is invalid or not set, path: " + path + ? "Invalid parameter, image folder path is invalid or not set, path: " + path : ""; err_msg += - (num_classes == nullptr || num_rows == nullptr) ? "Invalid parameter, num_class or num_rows cannot be null.\n" : ""; - err_msg += (dev_id >= num_dev || num_dev <= 0) - ? "Invalid parameter, num_shard must be greater than shard_id and greater than 0, got num_shard: " + - std::to_string(num_dev) + ", shard_id: " + std::to_string(dev_id) + ".\n" - : ""; + (num_classes == nullptr && num_rows == nullptr) ? 
"Invalid parameter, num_class and num_rows are null.\n" : ""; if (err_msg.empty() == false) { RETURN_STATUS_UNEXPECTED(err_msg); } - std::queue foldernames; + std::queue folder_paths; std::shared_ptr dir_itr = Path::DirIterator::OpenDirectory(&dir); + std::unordered_set folder_names; while (dir_itr->hasNext()) { Path subdir = dir_itr->next(); if (subdir.IsDirectory()) { - foldernames.push(subdir.toString()); + folder_paths.push(subdir.toString()); + if (!class_index.empty()) folder_names.insert(subdir.Basename()); } } - (*num_classes) = foldernames.size(); - while (foldernames.empty() == false) { - Path subdir(foldernames.front()); + if (num_classes != nullptr) { + // if class index is empty, get everything on disk + if (class_index.empty()) { + *num_classes = folder_paths.size(); + } else { + for (const auto &p : class_index) { + CHECK_FAIL_RETURN_UNEXPECTED(folder_names.find(p.first) != folder_names.end(), + "folder: " + p.first + " doesn't exist in " + path + " ."); + } + (*num_classes) = class_index.size(); + } + } + // return here if only num_class is needed + RETURN_OK_IF_TRUE(num_rows == nullptr); + while (folder_paths.empty() == false) { + Path subdir(folder_paths.front()); dir_itr = Path::DirIterator::OpenDirectory(&subdir); while (dir_itr->hasNext()) { if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) { ++row_cnt; } } - foldernames.pop(); + folder_paths.pop(); } - (*num_rows) = (row_cnt / num_dev) + (row_cnt % num_dev == 0 ? 
0 : 1); + (*num_rows) = row_cnt; return Status::OK(); } @@ -460,9 +469,12 @@ Status ImageFolderOp::GetDatasetSize(int64_t *dataset_size) { *dataset_size = dataset_size_; return Status::OK(); } - int64_t sample_size, num_rows, num_classes; + int64_t sample_size, num_rows; num_rows = num_rows_; - if (num_rows_ <= 0) RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, &num_classes)); + if (num_rows_ <= 0) { + // GetDatasetSize will not be impacted by class_index_ + RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, nullptr, {})); + } sample_size = sampler_->GetNumSamples(); *dataset_size = sample_size > 0 ? std::min(num_rows, sample_size) : num_rows; dataset_size_ = *dataset_size; @@ -475,8 +487,7 @@ Status ImageFolderOp::GetNumClasses(int64_t *num_classes) { *num_classes = num_classes_; return Status::OK(); } - int64_t num_rows = num_rows_; - RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, num_classes)); + RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, nullptr, num_classes, class_index_)); num_classes_ = *num_classes; return Status::OK(); } diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.h index 979642ecce..bd3e1d694c 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.h +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.h @@ -205,7 +205,7 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { // returned by this function may not be consistent with what image_folder_op is going to return // user this at your own risk! static Status CountRowsAndClasses(const std::string &path, const std::set &exts, int64_t *num_rows, - int64_t *num_classes, int64_t dev_id = 0, int64_t num_dev = 1); + int64_t *num_classes, std::map class_index); // Base-class override for NodePass visitor acceptor. 
// @param p - Pointer to the NodePass to be accepted. @@ -251,7 +251,7 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { // start walking of all dirs // @return - Status startAsyncWalk(); + Status StartAsyncWalk(); // Called first when function is called // @return diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc index 705b12d9fa..b0e716e649 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc @@ -81,14 +81,16 @@ std::vector> BatchNode::Build() { std::vector> node_ops; #ifdef ENABLE_PYTHON - node_ops.push_back(std::make_shared(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_, - in_col_names_, out_col_names_, batch_size_func_, batch_map_func_, - pad_map_)); - // need to insert a project when per_batch_func changes the number of columns + // if col_order_ isn't empty, then a project node needs to be attached after batch node. (same as map) + // this means project_node needs to be the parent of batch_node. this means node_ops = [project_node, batch_node] if (!col_order_.empty()) { auto project_op = std::make_shared(col_order_); node_ops.push_back(project_op); } + + node_ops.push_back(std::make_shared(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_, + in_col_names_, out_col_names_, batch_size_func_, batch_map_func_, + pad_map_)); #else node_ops.push_back(std::make_shared(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_, in_col_names_, pad_map_)); diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index b210b5dca1..ebd4e9a593 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -2891,7 +2891,7 @@ class ImageFolderDataset(MappableDataset): Number, number of batches. 
""" if self.dataset_size is None: - num_rows = ImageFolderOp.get_num_rows_and_classes(self.dataset_dir)[0] + num_rows = ImageFolderOp.get_num_rows(self.dataset_dir) self.dataset_size = get_num_rows(num_rows, self.num_shards) rows_from_sampler = self._get_sampler_dataset_size() if rows_from_sampler is not None and rows_from_sampler < self.dataset_size: @@ -2905,7 +2905,8 @@ class ImageFolderDataset(MappableDataset): Return: Number, number of classes. """ - return ImageFolderOp.get_num_rows_and_classes(self.dataset_dir)[1] + class_index = self.class_indexing if self.class_indexing else {} + return ImageFolderOp.get_num_classes(self.dataset_dir, class_index) def is_shuffled(self): if self.shuffle_level is None: diff --git a/tests/ut/cpp/dataset/c_api_dataset_save.cc b/tests/ut/cpp/dataset/c_api_dataset_save.cc index 88b3786ecd..b1eff2b1f8 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_save.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_save.cc @@ -65,7 +65,9 @@ TEST_F(MindDataTestPipeline, TestSaveCifar10AndLoad) { std::string temp_file = datasets_root_path_ + "/testCifar10Data/mind.mind"; std::string temp_file_db = datasets_root_path_ + "/testCifar10Data/mind.mind.db"; bool rc = ds->Save(temp_file); - EXPECT_EQ(rc, true); + // if save fails, no need to continue the execution + // save could fail if temp_file already exists + ASSERT_EQ(rc, true); // Stage 3: Load dataset from file output by stage 2 // Create a MindData Dataset diff --git a/tests/ut/cpp/dataset/c_api_datasets_test.cc b/tests/ut/cpp/dataset/c_api_datasets_test.cc index 080a60142a..31e7f598e2 100644 --- a/tests/ut/cpp/dataset/c_api_datasets_test.cc +++ b/tests/ut/cpp/dataset/c_api_datasets_test.cc @@ -304,4 +304,22 @@ TEST_F(MindDataTestPipeline, TestMnistFailWithNullSamplerFail) { std::shared_ptr iter = ds->CreateIterator(); // Expect failure: invalid Mnist input, sampler cannot be nullptr EXPECT_EQ(iter, nullptr); -} \ No newline at end of file +} + +TEST_F(MindDataTestPipeline, 
TestImageFolderClassIndexDatasetSize) { + std::string folder_path = datasets_root_path_ + "/testPK/data"; + std::map class_index; + class_index["class1"] = 111; + class_index["class2"] = 333; + auto ds = ImageFolder(folder_path, false, RandomSampler(), {}, class_index); + EXPECT_EQ(ds->GetNumClasses(), 2); +} + +TEST_F(MindDataTestPipeline, TestImageFolderClassIndexDatasetSizeFail) { + std::string folder_path = datasets_root_path_ + "/testPK/data"; + std::map class_index; + class_index["class1"] = 111; + class_index["wrong class"] = 333; + auto ds = ImageFolder(folder_path, false, RandomSampler(), {}, class_index); + EXPECT_EQ(ds->GetNumClasses(), -1); +} diff --git a/tests/ut/cpp/dataset/image_folder_op_test.cc b/tests/ut/cpp/dataset/image_folder_op_test.cc index 332343d45f..109536ebc6 100644 --- a/tests/ut/cpp/dataset/image_folder_op_test.cc +++ b/tests/ut/cpp/dataset/image_folder_op_test.cc @@ -38,9 +38,9 @@ namespace common = mindspore::common; using namespace mindspore::dataset; -using mindspore::MsLogLevel::ERROR; -using mindspore::ExceptionType::NoExceptionType; using mindspore::LogStream; +using mindspore::ExceptionType::NoExceptionType; +using mindspore::MsLogLevel::ERROR; std::shared_ptr Batch(int batch_size = 1, bool drop = false, int rows_per_buf = 2); @@ -54,14 +54,17 @@ std::shared_ptr ImageFolder(int64_t num_works, int64_t rows, int6 std::shared_ptr so; ImageFolderOp::Builder builder; Status rc = builder.SetNumWorkers(num_works) - .SetImageFolderDir(path) - .SetRowsPerBuffer(rows) - .SetOpConnectorSize(conns) - .SetExtensions({".jpg", ".JPEG"}) - .SetSampler(std::move(sampler)) - .SetClassIndex(map) - .SetDecode(decode) - .Build(&so); + .SetImageFolderDir(path) + .SetRowsPerBuffer(rows) + .SetOpConnectorSize(conns) + .SetExtensions({".jpg", ".JPEG"}) + .SetSampler(std::move(sampler)) + .SetClassIndex(map) + .SetDecode(decode) + .Build(&so); + if (rc.IsError()) { + MS_LOG(ERROR) << "Fail to build ImageFolderOp: " << rc.ToString() << "\n"; + } return 
so; } @@ -166,9 +169,9 @@ TEST_F(MindDataTestImageFolderSampler, TestSequentialImageFolderWithRepeatBatch) auto tree = Build({ImageFolder(16, 2, 32, folder_path, false), Repeat(2), Batch(11)}); tree->Prepare(); int32_t res[4][11] = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, - {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, - {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}}; + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}}; Status rc = tree->Launch(); if (rc.IsError()) { MS_LOG(ERROR) << "Return code error detected during tree launch: " << common::SafeCStr(rc.ToString()) << "."; @@ -184,7 +187,7 @@ TEST_F(MindDataTestImageFolderSampler, TestSequentialImageFolderWithRepeatBatch) Create1DTensor(&label, 11, reinterpret_cast(res[i % 4]), DataType::DE_INT32); EXPECT_TRUE((*label) == (*tensor_map["label"])); MS_LOG(DEBUG) << "row: " << i << " " << tensor_map["image"]->shape() << " (*label):" << (*label) - << " *tensor_map[label]: " << *tensor_map["label"] << std::endl; + << " *tensor_map[label]: " << *tensor_map["label"] << std::endl; i++; di.GetNextAsMap(&tensor_map); } @@ -373,8 +376,8 @@ TEST_F(MindDataTestImageFolderSampler, TestImageFolderDecode) { while (tensor_map.size() != 0) { tensor_map["label"]->GetItemAt(&label, {}); EXPECT_TRUE(label == res[i / 11]); - EXPECT_TRUE( - tensor_map["image"]->shape() == TensorShape({2268, 4032, 3})); // verify shapes are correct after decode + EXPECT_TRUE(tensor_map["image"]->shape() == + TensorShape({2268, 4032, 3})); // verify shapes are correct after decode MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "\n"; i++; di.GetNextAsMap(&tensor_map); diff --git a/tests/ut/python/dataset/test_get_size.py b/tests/ut/python/dataset/test_get_size.py index 3fb305d211..0c1daed729 100644 --- a/tests/ut/python/dataset/test_get_size.py +++ b/tests/ut/python/dataset/test_get_size.py @@ -158,13 +158,23 @@ def test_imagefolder(): assert 
data.get_dataset_size() == 10 assert data.num_classes() == 4 + data = ds.ImageFolderDataset("../data/dataset/testPK/data/", class_indexing={"class1": 1, "class2": 22}) + assert data.num_classes() == 2 + + data = ds.ImageFolderDataset("../data/dataset/testPK/data/", class_indexing={"class1": 1, "wrong name": 22}) + err_msg = "" + try: + data.num_classes() + except RuntimeError as e: + err_msg = str(e) + assert "wrong name doesn't exist" in err_msg + if __name__ == '__main__': - # test_compare_v1_and_2() - # test_imagefolder() - # test_manifest() + test_manifest() test_case1() - # test_case2() - # test_case3() - # test_case4() - # test_case5() + test_case2() + test_case3() + test_case4() + test_case5() + test_imagefolder()