!8538 fix get_num_class in imagefolder

From: @ziruiwu
Reviewed-by: @robingrosman,@nsyca
Signed-off-by: @nsyca
pull/8538/MERGE
Committed-by: mindspore-ci-bot (via Gitee)
commit d2427b6e89

@ -91,11 +91,17 @@ PYBIND_REGISTER(CocoOp, 1, ([](const py::module *m) {
PYBIND_REGISTER(ImageFolderOp, 1, ([](const py::module *m) {
(void)py::class_<ImageFolderOp, DatasetOp, std::shared_ptr<ImageFolderOp>>(*m, "ImageFolderOp")
.def_static("get_num_rows_and_classes", [](const std::string &path) {
int64_t count = 0, num_classes = 0;
THROW_IF_ERROR(
ImageFolderOp::CountRowsAndClasses(path, std::set<std::string>{}, &count, &num_classes));
return py::make_tuple(count, num_classes);
.def_static("get_num_rows",
[](const std::string &path) {
int64_t count = 0;
THROW_IF_ERROR(ImageFolderOp::CountRowsAndClasses(path, {}, &count, nullptr, {}));
return count;
})
.def_static("get_num_classes", [](const std::string &path,
const std::map<std::string, int32_t> class_index) {
int64_t num_classes = 0;
THROW_IF_ERROR(ImageFolderOp::CountRowsAndClasses(path, {}, nullptr, &num_classes, class_index));
return num_classes;
});
}));
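For orientation, a minimal Python sketch of how the two split bindings are expected to be driven from the Python layer, mirroring the dataset.py change further below; the _c_dataengine module path and the sample class map are illustrative assumptions, not part of this commit:

# illustrative only: module path is an assumption, values mirror the tests below
from mindspore import _c_dataengine as cde

num_rows = cde.ImageFolderOp.get_num_rows("/path/to/imagefolder")
num_classes = cde.ImageFolderOp.get_num_classes("/path/to/imagefolder", {"class1": 111, "class2": 333})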

@ -15,7 +15,7 @@
*/
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
#include <fstream>
#include <iomanip>
#include <unordered_set>
#include "utils/ms_utils.h"
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/core/tensor_shape.h"
@ -280,7 +280,7 @@ Status ImageFolderOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_i
RETURN_STATUS_UNEXPECTED("No images found in dataset, please check if Op read images successfully or not.");
} else {
RETURN_STATUS_UNEXPECTED(
"Map for storaging image-index pair is nullptr or has been set in other place,"
"Map containing image-index pair is nullptr or has been set in other place,"
"it must be empty before using GetClassIds.");
}
}
@ -294,14 +294,14 @@ Status ImageFolderOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_i
}
// Worker Entry for pre-scanning all the folders and do the 1st level shuffle
- // Worker pull a file name from mFoldernameQueue (which is a Queue), walks all the images under that foldername
+ // Worker pull a file name from folder_name_queue_ (which is a Queue), walks all the images under that foldername
// After walking is complete, sort all the file names (relative path to all jpeg files under the same directory )
// (Sort is automatically conducted using a set which is implemented using a Red-Black Tree)
// Add the sorted filenames in to a queue. The make a pair (foldername, queue<filenames>*),
// foldername is used for 2nd level sorting.
// FYI: 1st level sorting: sort all images under the same directory.
// FYI: 2nd level sorting: sort all folder names
- // push this pair to mImagenameQueue (which is again a Queue)
+ // push this pair to image_name_queue (which is again a Queue)
Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) {
TaskManager::FindMe()->Post();
std::string folder_name;
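As an aside, a small Python sketch of the two-level ordering described in the comment above, for illustration only: the real scan is done by the C++ prescan workers (std::set provides the sorted order), and the label-by-sorted-folder-order assignment here is an assumption made for the example.

import os

def scan_image_folder(root, exts=(".jpg", ".jpeg")):
    # 2nd level sorting: walk class folders in sorted (lexicographic) order
    folders = sorted(d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d)))
    for label, folder in enumerate(folders):
        # 1st level sorting: image names sorted within each folder
        images = sorted(f for f in os.listdir(os.path.join(root, folder))
                        if f.lower().endswith(exts))
        for name in images:
            yield os.path.join(folder, name), label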
@ -334,7 +334,7 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) {
return Status::OK();
}
- // This helper function recursively walks all foldernames, and send each foldername to mFoldernameQueue
+ // This helper function recursively walks all folder_paths, and send each foldername to folder_name_queue_
// if mRecursive == false, don't go into folder of folders
Status ImageFolderOp::RecursiveWalkFolder(Path *dir) {
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(dir);
@ -355,7 +355,7 @@ Status ImageFolderOp::RecursiveWalkFolder(Path *dir) {
}
// A thread that calls RecursiveWalkFolder
- Status ImageFolderOp::startAsyncWalk() {
+ Status ImageFolderOp::StartAsyncWalk() {
TaskManager::FindMe()->Post();
Path dir(folder_path_);
if (dir.Exists() == false || dir.IsDirectory() == false) {
@ -363,8 +363,8 @@ Status ImageFolderOp::startAsyncWalk() {
}
dirname_offset_ = folder_path_.length();
RETURN_IF_NOT_OK(RecursiveWalkFolder(&dir));
- // send out num_workers_ end signal to mFoldernameQueue, 1 for each worker.
- // Upon receiving end Signal, worker quits and set another end Signal to mImagenameQueue.
+ // send out num_workers_ end signal to folder_name_queue_, 1 for each worker.
+ // Upon receiving end Signal, worker quits and set another end Signal to image_name_queue.
for (int32_t ind = 0; ind < num_workers_; ++ind) {
RETURN_IF_NOT_OK(folder_name_queue_->EmplaceBack("")); // end signal
}
@ -372,19 +372,17 @@ Status ImageFolderOp::startAsyncWalk() {
}
Status ImageFolderOp::LaunchThreadsAndInitOp() {
- if (tree_ == nullptr) {
- RETURN_STATUS_UNEXPECTED("Pipeline init failed, Execution tree not set.");
- }
+ RETURN_UNEXPECTED_IF_NULL(tree_);
// Registers QueueList and individual Queues for interrupt services
RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
RETURN_IF_NOT_OK(folder_name_queue_->Register(tree_->AllTasks()));
RETURN_IF_NOT_OK(image_name_queue_->Register(tree_->AllTasks()));
RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks()));
// The following code launch 3 threads group
- // 1) A thread that walks all folders and push the folder names to a util:Queue mFoldernameQueue.
- // 2) Workers that pull foldername from mFoldernameQueue, walk it and return the sorted images to mImagenameQueue
+ // 1) A thread that walks all folders and push the folder names to a util:Queue folder_name_queue_.
+ // 2) Workers that pull foldername from folder_name_queue_, walk it and return the sorted images to image_name_queue
// 3) Launch main workers that load DataBuffers by reading all images
- RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::startAsyncWalk, this)));
+ RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::StartAsyncWalk, this)));
RETURN_IF_NOT_OK(
tree_->LaunchWorkers(num_workers_, std::bind(&ImageFolderOp::PrescanWorkerEntry, this, std::placeholders::_1)));
RETURN_IF_NOT_OK(
@ -397,42 +395,53 @@ Status ImageFolderOp::LaunchThreadsAndInitOp() {
}
Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::set<std::string> &exts, int64_t *num_rows,
- int64_t *num_classes, int64_t dev_id, int64_t num_dev) {
+ int64_t *num_classes, std::map<std::string, int32_t> class_index) {
Path dir(path);
std::string err_msg = "";
int64_t row_cnt = 0;
err_msg += (dir.Exists() == false || dir.IsDirectory() == false)
? "Invalid parameter, image folde path is invalid or not set, path: " + path
? "Invalid parameter, image folder path is invalid or not set, path: " + path
: "";
err_msg +=
- (num_classes == nullptr || num_rows == nullptr) ? "Invalid parameter, num_class or num_rows cannot be null.\n" : "";
- err_msg += (dev_id >= num_dev || num_dev <= 0)
- ? "Invalid parameter, num_shard must be greater than shard_id and greater than 0, got num_shard: " +
- std::to_string(num_dev) + ", shard_id: " + std::to_string(dev_id) + ".\n"
- : "";
+ (num_classes == nullptr && num_rows == nullptr) ? "Invalid parameter, num_class and num_rows are null.\n" : "";
if (err_msg.empty() == false) {
RETURN_STATUS_UNEXPECTED(err_msg);
}
- std::queue<std::string> foldernames;
+ std::queue<std::string> folder_paths;
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(&dir);
+ std::unordered_set<std::string> folder_names;
while (dir_itr->hasNext()) {
Path subdir = dir_itr->next();
if (subdir.IsDirectory()) {
- foldernames.push(subdir.toString());
+ folder_paths.push(subdir.toString());
+ if (!class_index.empty()) folder_names.insert(subdir.Basename());
}
}
- (*num_classes) = foldernames.size();
- while (foldernames.empty() == false) {
- Path subdir(foldernames.front());
+ if (num_classes != nullptr) {
+ // if class index is empty, get everything on disk
+ if (class_index.empty()) {
+ *num_classes = folder_paths.size();
+ } else {
+ for (const auto &p : class_index) {
+ CHECK_FAIL_RETURN_UNEXPECTED(folder_names.find(p.first) != folder_names.end(),
+ "folder: " + p.first + " doesn't exist in " + path + " .");
+ }
+ (*num_classes) = class_index.size();
+ }
+ }
+ // return here if only num_class is needed
+ RETURN_OK_IF_TRUE(num_rows == nullptr);
+ while (folder_paths.empty() == false) {
+ Path subdir(folder_paths.front());
dir_itr = Path::DirIterator::OpenDirectory(&subdir);
while (dir_itr->hasNext()) {
if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) {
++row_cnt;
}
}
- foldernames.pop();
+ folder_paths.pop();
}
- (*num_rows) = (row_cnt / num_dev) + (row_cnt % num_dev == 0 ? 0 : 1);
+ (*num_rows) = row_cnt;
return Status::OK();
}
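To summarize the new contract of CountRowsAndClasses, here is a rough Python rendering, for illustration only (not the shipped C++ implementation): num_classes is the number of sub-folders when class_index is empty, otherwise the size of class_index after checking that every key exists as a folder; num_rows is the total file count across sub-folders, no longer divided across shards.

import os

def count_rows_and_classes(path, exts=None, class_index=None):
    # rough sketch of the behaviour added in this commit; names are illustrative
    class_index = class_index or {}
    folders = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    if class_index:
        # every user-supplied class name must exist as a folder on disk
        for name in class_index:
            if name not in folders:
                raise RuntimeError("folder: " + name + " doesn't exist in " + path + " .")
        num_classes = len(class_index)
    else:
        # if class index is empty, get everything on disk
        num_classes = len(folders)
    num_rows = 0
    for folder in folders:
        for f in os.listdir(os.path.join(path, folder)):
            if not exts or os.path.splitext(f)[1] in exts:
                num_rows += 1
    return num_rows, num_classes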
@ -460,9 +469,12 @@ Status ImageFolderOp::GetDatasetSize(int64_t *dataset_size) {
*dataset_size = dataset_size_;
return Status::OK();
}
- int64_t sample_size, num_rows, num_classes;
+ int64_t sample_size, num_rows;
num_rows = num_rows_;
- if (num_rows_ <= 0) RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, &num_classes));
+ if (num_rows_ <= 0) {
+ // GetDatasetSize will not be impacted by class_index_
+ RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, nullptr, {}));
+ }
sample_size = sampler_->GetNumSamples();
*dataset_size = sample_size > 0 ? std::min(num_rows, sample_size) : num_rows;
dataset_size_ = *dataset_size;
@ -475,8 +487,7 @@ Status ImageFolderOp::GetNumClasses(int64_t *num_classes) {
*num_classes = num_classes_;
return Status::OK();
}
- int64_t num_rows = num_rows_;
- RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, num_classes));
+ RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, nullptr, num_classes, class_index_));
num_classes_ = *num_classes;
return Status::OK();
}

@ -205,7 +205,7 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp {
// returned by this function may not be consistent with what image_folder_op is going to return
// user this at your own risk!
static Status CountRowsAndClasses(const std::string &path, const std::set<std::string> &exts, int64_t *num_rows,
- int64_t *num_classes, int64_t dev_id = 0, int64_t num_dev = 1);
+ int64_t *num_classes, std::map<std::string, int32_t> class_index);
// Base-class override for NodePass visitor acceptor.
// @param p - Pointer to the NodePass to be accepted.
@ -251,7 +251,7 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp {
// start walking of all dirs
// @return
- Status startAsyncWalk();
+ Status StartAsyncWalk();
// Called first when function is called
// @return

@ -81,14 +81,16 @@ std::vector<std::shared_ptr<DatasetOp>> BatchNode::Build() {
std::vector<std::shared_ptr<DatasetOp>> node_ops;
#ifdef ENABLE_PYTHON
- node_ops.push_back(std::make_shared<BatchOp>(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_,
- in_col_names_, out_col_names_, batch_size_func_, batch_map_func_,
- pad_map_));
// need to insert a project when per_batch_func changes the number of columns
+ // if col_order_ isn't empty, then a project node needs to be attached after batch node. (same as map)
+ // this means project_node needs to be the parent of batch_node. this means node_ops = [project_node, batch_node]
if (!col_order_.empty()) {
auto project_op = std::make_shared<ProjectOp>(col_order_);
node_ops.push_back(project_op);
}
+ node_ops.push_back(std::make_shared<BatchOp>(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_,
+ in_col_names_, out_col_names_, batch_size_func_, batch_map_func_,
+ pad_map_));
#else
node_ops.push_back(std::make_shared<BatchOp>(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_,
in_col_names_, pad_map_));

@ -2891,7 +2891,7 @@ class ImageFolderDataset(MappableDataset):
Number, number of batches.
"""
if self.dataset_size is None:
- num_rows = ImageFolderOp.get_num_rows_and_classes(self.dataset_dir)[0]
+ num_rows = ImageFolderOp.get_num_rows(self.dataset_dir)
self.dataset_size = get_num_rows(num_rows, self.num_shards)
rows_from_sampler = self._get_sampler_dataset_size()
if rows_from_sampler is not None and rows_from_sampler < self.dataset_size:
@ -2905,7 +2905,8 @@ class ImageFolderDataset(MappableDataset):
Return:
Number, number of classes.
"""
- return ImageFolderOp.get_num_rows_and_classes(self.dataset_dir)[1]
+ class_index = self.class_indexing if self.class_indexing else {}
+ return ImageFolderOp.get_num_classes(self.dataset_dir, class_index)
def is_shuffled(self):
if self.shuffle_level is None:
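A short usage sketch of the user-facing behaviour this enables; values mirror the updated tests further below, and the dataset path is a placeholder:

import mindspore.dataset as ds

data = ds.ImageFolderDataset("../data/dataset/testPK/data/", class_indexing={"class1": 1, "class2": 22})
print(data.num_classes())   # 2: only the classes listed in class_indexing are counted
data = ds.ImageFolderDataset("../data/dataset/testPK/data/")
print(data.num_classes())   # 4: with no class_indexing, every sub-folder on disk is a class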

@ -65,7 +65,9 @@ TEST_F(MindDataTestPipeline, TestSaveCifar10AndLoad) {
std::string temp_file = datasets_root_path_ + "/testCifar10Data/mind.mind";
std::string temp_file_db = datasets_root_path_ + "/testCifar10Data/mind.mind.db";
bool rc = ds->Save(temp_file);
- EXPECT_EQ(rc, true);
+ // if save fails, no need to continue the execution
+ // save could fail if temp_file already exists
+ ASSERT_EQ(rc, true);
// Stage 3: Load dataset from file output by stage 2
// Create a MindData Dataset

@ -305,3 +305,21 @@ TEST_F(MindDataTestPipeline, TestMnistFailWithNullSamplerFail) {
// Expect failure: invalid Mnist input, sampler cannot be nullptr
EXPECT_EQ(iter, nullptr);
}
+ TEST_F(MindDataTestPipeline, TestImageFolderClassIndexDatasetSize) {
+ std::string folder_path = datasets_root_path_ + "/testPK/data";
+ std::map<std::string, int32_t> class_index;
+ class_index["class1"] = 111;
+ class_index["class2"] = 333;
+ auto ds = ImageFolder(folder_path, false, RandomSampler(), {}, class_index);
+ EXPECT_EQ(ds->GetNumClasses(), 2);
+ }
+ TEST_F(MindDataTestPipeline, TestImageFolderClassIndexDatasetSizeFail) {
+ std::string folder_path = datasets_root_path_ + "/testPK/data";
+ std::map<std::string, int32_t> class_index;
+ class_index["class1"] = 111;
+ class_index["wrong class"] = 333;
+ auto ds = ImageFolder(folder_path, false, RandomSampler(), {}, class_index);
+ EXPECT_EQ(ds->GetNumClasses(), -1);
+ }

@ -38,9 +38,9 @@
namespace common = mindspore::common;
using namespace mindspore::dataset;
- using mindspore::MsLogLevel::ERROR;
- using mindspore::ExceptionType::NoExceptionType;
using mindspore::LogStream;
+ using mindspore::ExceptionType::NoExceptionType;
+ using mindspore::MsLogLevel::ERROR;
std::shared_ptr<BatchOp> Batch(int batch_size = 1, bool drop = false, int rows_per_buf = 2);
@ -54,14 +54,17 @@ std::shared_ptr<ImageFolderOp> ImageFolder(int64_t num_works, int64_t rows, int6
std::shared_ptr<ImageFolderOp> so;
ImageFolderOp::Builder builder;
Status rc = builder.SetNumWorkers(num_works)
- .SetImageFolderDir(path)
- .SetRowsPerBuffer(rows)
- .SetOpConnectorSize(conns)
- .SetExtensions({".jpg", ".JPEG"})
- .SetSampler(std::move(sampler))
- .SetClassIndex(map)
- .SetDecode(decode)
- .Build(&so);
+ .SetImageFolderDir(path)
+ .SetRowsPerBuffer(rows)
+ .SetOpConnectorSize(conns)
+ .SetExtensions({".jpg", ".JPEG"})
+ .SetSampler(std::move(sampler))
+ .SetClassIndex(map)
+ .SetDecode(decode)
+ .Build(&so);
+ if (rc.IsError()) {
+ MS_LOG(ERROR) << "Fail to build ImageFolderOp: " << rc.ToString() << "\n";
+ }
return so;
}
@ -166,9 +169,9 @@ TEST_F(MindDataTestImageFolderSampler, TestSequentialImageFolderWithRepeatBatch)
auto tree = Build({ImageFolder(16, 2, 32, folder_path, false), Repeat(2), Batch(11)});
tree->Prepare();
int32_t res[4][11] = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
- {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
- {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}};
+ {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+ {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
+ {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}};
Status rc = tree->Launch();
if (rc.IsError()) {
MS_LOG(ERROR) << "Return code error detected during tree launch: " << common::SafeCStr(rc.ToString()) << ".";
@ -184,7 +187,7 @@ TEST_F(MindDataTestImageFolderSampler, TestSequentialImageFolderWithRepeatBatch)
Create1DTensor(&label, 11, reinterpret_cast<unsigned char *>(res[i % 4]), DataType::DE_INT32);
EXPECT_TRUE((*label) == (*tensor_map["label"]));
MS_LOG(DEBUG) << "row: " << i << " " << tensor_map["image"]->shape() << " (*label):" << (*label)
<< " *tensor_map[label]: " << *tensor_map["label"] << std::endl;
<< " *tensor_map[label]: " << *tensor_map["label"] << std::endl;
i++;
di.GetNextAsMap(&tensor_map);
}
@ -373,8 +376,8 @@ TEST_F(MindDataTestImageFolderSampler, TestImageFolderDecode) {
while (tensor_map.size() != 0) {
tensor_map["label"]->GetItemAt<int32_t>(&label, {});
EXPECT_TRUE(label == res[i / 11]);
- EXPECT_TRUE(
- tensor_map["image"]->shape() == TensorShape({2268, 4032, 3})); // verify shapes are correct after decode
+ EXPECT_TRUE(tensor_map["image"]->shape() ==
+ TensorShape({2268, 4032, 3})); // verify shapes are correct after decode
MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "\n";
i++;
di.GetNextAsMap(&tensor_map);

@ -158,13 +158,23 @@ def test_imagefolder():
assert data.get_dataset_size() == 10
assert data.num_classes() == 4
+ data = ds.ImageFolderDataset("../data/dataset/testPK/data/", class_indexing={"class1": 1, "class2": 22})
+ assert data.num_classes() == 2
+ data = ds.ImageFolderDataset("../data/dataset/testPK/data/", class_indexing={"class1": 1, "wrong name": 22})
+ err_msg = ""
+ try:
+ data.num_classes()
+ except RuntimeError as e:
+ err_msg = str(e)
+ assert "wrong name doesn't exist" in err_msg
if __name__ == '__main__':
# test_compare_v1_and_2()
# test_imagefolder()
# test_manifest()
test_manifest()
test_case1()
- # test_case2()
- # test_case3()
- # test_case4()
- # test_case5()
+ test_case2()
+ test_case3()
+ test_case4()
+ test_case5()
+ test_imagefolder()
