|
|
@ -15,7 +15,7 @@
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
|
|
|
|
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
|
|
|
|
#include <fstream>
|
|
|
|
#include <fstream>
|
|
|
|
#include <iomanip>
|
|
|
|
#include <unordered_set>
|
|
|
|
#include "utils/ms_utils.h"
|
|
|
|
#include "utils/ms_utils.h"
|
|
|
|
#include "minddata/dataset/core/config_manager.h"
|
|
|
|
#include "minddata/dataset/core/config_manager.h"
|
|
|
|
#include "minddata/dataset/core/tensor_shape.h"
|
|
|
|
#include "minddata/dataset/core/tensor_shape.h"
|
|
|
@ -280,7 +280,7 @@ Status ImageFolderOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_i
|
|
|
|
RETURN_STATUS_UNEXPECTED("No images found in dataset, please check if Op read images successfully or not.");
|
|
|
|
RETURN_STATUS_UNEXPECTED("No images found in dataset, please check if Op read images successfully or not.");
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
RETURN_STATUS_UNEXPECTED(
|
|
|
|
RETURN_STATUS_UNEXPECTED(
|
|
|
|
"Map for storaging image-index pair is nullptr or has been set in other place,"
|
|
|
|
"Map containing image-index pair is nullptr or has been set in other place,"
|
|
|
|
"it must be empty before using GetClassIds.");
|
|
|
|
"it must be empty before using GetClassIds.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -294,14 +294,14 @@ Status ImageFolderOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_i
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Worker Entry for pre-scanning all the folders and do the 1st level shuffle
|
|
|
|
// Worker Entry for pre-scanning all the folders and do the 1st level shuffle
|
|
|
|
// Worker pull a file name from mFoldernameQueue (which is a Queue), walks all the images under that foldername
|
|
|
|
// Worker pulls a folder name from folder_name_queue_ (which is a Queue), walks all the images under that foldername
|
|
|
|
// After walking is complete, sort all the file names (relative path to all jpeg files under the same directory )
|
|
|
|
// After walking is complete, sort all the file names (relative path to all jpeg files under the same directory )
|
|
|
|
// (Sort is automatically conducted using a set which is implemented using a Red-Black Tree)
|
|
|
|
// (Sort is automatically conducted using a set which is implemented using a Red-Black Tree)
|
|
|
|
// Add the sorted filenames into a queue. Then make a pair (foldername, queue<filenames>*),
|
|
|
|
// Add the sorted filenames into a queue. Then make a pair (foldername, queue<filenames>*),
|
|
|
|
// foldername is used for 2nd level sorting.
|
|
|
|
// foldername is used for 2nd level sorting.
|
|
|
|
// FYI: 1st level sorting: sort all images under the same directory.
|
|
|
|
// FYI: 1st level sorting: sort all images under the same directory.
|
|
|
|
// FYI: 2nd level sorting: sort all folder names
|
|
|
|
// FYI: 2nd level sorting: sort all folder names
|
|
|
|
// push this pair to mImagenameQueue (which is again a Queue)
|
|
|
|
// push this pair to image_name_queue_ (which is again a Queue)
|
|
|
|
Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) {
|
|
|
|
Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) {
|
|
|
|
TaskManager::FindMe()->Post();
|
|
|
|
TaskManager::FindMe()->Post();
|
|
|
|
std::string folder_name;
|
|
|
|
std::string folder_name;
|
|
|
@ -334,7 +334,7 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) {
|
|
|
|
return Status::OK();
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// This helper function recursively walks all foldernames, and send each foldername to mFoldernameQueue
|
|
|
|
// This helper function recursively walks all folder_paths, and send each foldername to folder_name_queue_
|
|
|
|
// if mRecursive == false, don't go into folder of folders
|
|
|
|
// if mRecursive == false, don't go into folder of folders
|
|
|
|
Status ImageFolderOp::RecursiveWalkFolder(Path *dir) {
|
|
|
|
Status ImageFolderOp::RecursiveWalkFolder(Path *dir) {
|
|
|
|
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(dir);
|
|
|
|
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(dir);
|
|
|
@ -355,7 +355,7 @@ Status ImageFolderOp::RecursiveWalkFolder(Path *dir) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// A thread that calls RecursiveWalkFolder
|
|
|
|
// A thread that calls RecursiveWalkFolder
|
|
|
|
Status ImageFolderOp::startAsyncWalk() {
|
|
|
|
Status ImageFolderOp::StartAsyncWalk() {
|
|
|
|
TaskManager::FindMe()->Post();
|
|
|
|
TaskManager::FindMe()->Post();
|
|
|
|
Path dir(folder_path_);
|
|
|
|
Path dir(folder_path_);
|
|
|
|
if (dir.Exists() == false || dir.IsDirectory() == false) {
|
|
|
|
if (dir.Exists() == false || dir.IsDirectory() == false) {
|
|
|
@ -363,8 +363,8 @@ Status ImageFolderOp::startAsyncWalk() {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
dirname_offset_ = folder_path_.length();
|
|
|
|
dirname_offset_ = folder_path_.length();
|
|
|
|
RETURN_IF_NOT_OK(RecursiveWalkFolder(&dir));
|
|
|
|
RETURN_IF_NOT_OK(RecursiveWalkFolder(&dir));
|
|
|
|
// send out num_workers_ end signal to mFoldernameQueue, 1 for each worker.
|
|
|
|
// send out num_workers_ end signal to folder_name_queue_, 1 for each worker.
|
|
|
|
// Upon receiving end Signal, worker quits and set another end Signal to mImagenameQueue.
|
|
|
|
// Upon receiving end Signal, worker quits and sends another end Signal to image_name_queue_.
|
|
|
|
for (int32_t ind = 0; ind < num_workers_; ++ind) {
|
|
|
|
for (int32_t ind = 0; ind < num_workers_; ++ind) {
|
|
|
|
RETURN_IF_NOT_OK(folder_name_queue_->EmplaceBack("")); // end signal
|
|
|
|
RETURN_IF_NOT_OK(folder_name_queue_->EmplaceBack("")); // end signal
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -372,19 +372,17 @@ Status ImageFolderOp::startAsyncWalk() {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
Status ImageFolderOp::LaunchThreadsAndInitOp() {
|
|
|
|
Status ImageFolderOp::LaunchThreadsAndInitOp() {
|
|
|
|
if (tree_ == nullptr) {
|
|
|
|
RETURN_UNEXPECTED_IF_NULL(tree_);
|
|
|
|
RETURN_STATUS_UNEXPECTED("Pipeline init failed, Execution tree not set.");
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// Registers QueueList and individual Queues for interrupt services
|
|
|
|
// Registers QueueList and individual Queues for interrupt services
|
|
|
|
RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
|
|
|
|
RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
|
|
|
|
RETURN_IF_NOT_OK(folder_name_queue_->Register(tree_->AllTasks()));
|
|
|
|
RETURN_IF_NOT_OK(folder_name_queue_->Register(tree_->AllTasks()));
|
|
|
|
RETURN_IF_NOT_OK(image_name_queue_->Register(tree_->AllTasks()));
|
|
|
|
RETURN_IF_NOT_OK(image_name_queue_->Register(tree_->AllTasks()));
|
|
|
|
RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks()));
|
|
|
|
RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks()));
|
|
|
|
// The following code launch 3 threads group
|
|
|
|
// The following code launch 3 threads group
|
|
|
|
// 1) A thread that walks all folders and push the folder names to a util:Queue mFoldernameQueue.
|
|
|
|
// 1) A thread that walks all folders and push the folder names to a util:Queue folder_name_queue_.
|
|
|
|
// 2) Workers that pull foldername from mFoldernameQueue, walk it and return the sorted images to mImagenameQueue
|
|
|
|
// 2) Workers that pull foldername from folder_name_queue_, walk it and return the sorted images to image_name_queue_
|
|
|
|
// 3) Launch main workers that load DataBuffers by reading all images
|
|
|
|
// 3) Launch main workers that load DataBuffers by reading all images
|
|
|
|
RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::startAsyncWalk, this)));
|
|
|
|
RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::StartAsyncWalk, this)));
|
|
|
|
RETURN_IF_NOT_OK(
|
|
|
|
RETURN_IF_NOT_OK(
|
|
|
|
tree_->LaunchWorkers(num_workers_, std::bind(&ImageFolderOp::PrescanWorkerEntry, this, std::placeholders::_1)));
|
|
|
|
tree_->LaunchWorkers(num_workers_, std::bind(&ImageFolderOp::PrescanWorkerEntry, this, std::placeholders::_1)));
|
|
|
|
RETURN_IF_NOT_OK(
|
|
|
|
RETURN_IF_NOT_OK(
|
|
|
@ -397,42 +395,53 @@ Status ImageFolderOp::LaunchThreadsAndInitOp() {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::set<std::string> &exts, int64_t *num_rows,
|
|
|
|
Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::set<std::string> &exts, int64_t *num_rows,
|
|
|
|
int64_t *num_classes, int64_t dev_id, int64_t num_dev) {
|
|
|
|
int64_t *num_classes, std::map<std::string, int32_t> class_index) {
|
|
|
|
Path dir(path);
|
|
|
|
Path dir(path);
|
|
|
|
std::string err_msg = "";
|
|
|
|
std::string err_msg = "";
|
|
|
|
int64_t row_cnt = 0;
|
|
|
|
int64_t row_cnt = 0;
|
|
|
|
err_msg += (dir.Exists() == false || dir.IsDirectory() == false)
|
|
|
|
err_msg += (dir.Exists() == false || dir.IsDirectory() == false)
|
|
|
|
? "Invalid parameter, image folde path is invalid or not set, path: " + path
|
|
|
|
? "Invalid parameter, image folder path is invalid or not set, path: " + path
|
|
|
|
: "";
|
|
|
|
: "";
|
|
|
|
err_msg +=
|
|
|
|
err_msg +=
|
|
|
|
(num_classes == nullptr || num_rows == nullptr) ? "Invalid parameter, num_class or num_rows cannot be null.\n" : "";
|
|
|
|
(num_classes == nullptr && num_rows == nullptr) ? "Invalid parameter, num_class and num_rows are null.\n" : "";
|
|
|
|
err_msg += (dev_id >= num_dev || num_dev <= 0)
|
|
|
|
|
|
|
|
? "Invalid parameter, num_shard must be greater than shard_id and greater than 0, got num_shard: " +
|
|
|
|
|
|
|
|
std::to_string(num_dev) + ", shard_id: " + std::to_string(dev_id) + ".\n"
|
|
|
|
|
|
|
|
: "";
|
|
|
|
|
|
|
|
if (err_msg.empty() == false) {
|
|
|
|
if (err_msg.empty() == false) {
|
|
|
|
RETURN_STATUS_UNEXPECTED(err_msg);
|
|
|
|
RETURN_STATUS_UNEXPECTED(err_msg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
std::queue<std::string> foldernames;
|
|
|
|
std::queue<std::string> folder_paths;
|
|
|
|
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(&dir);
|
|
|
|
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(&dir);
|
|
|
|
|
|
|
|
std::unordered_set<std::string> folder_names;
|
|
|
|
while (dir_itr->hasNext()) {
|
|
|
|
while (dir_itr->hasNext()) {
|
|
|
|
Path subdir = dir_itr->next();
|
|
|
|
Path subdir = dir_itr->next();
|
|
|
|
if (subdir.IsDirectory()) {
|
|
|
|
if (subdir.IsDirectory()) {
|
|
|
|
foldernames.push(subdir.toString());
|
|
|
|
folder_paths.push(subdir.toString());
|
|
|
|
|
|
|
|
if (!class_index.empty()) folder_names.insert(subdir.Basename());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
(*num_classes) = foldernames.size();
|
|
|
|
if (num_classes != nullptr) {
|
|
|
|
while (foldernames.empty() == false) {
|
|
|
|
// if class index is empty, get everything on disk
|
|
|
|
Path subdir(foldernames.front());
|
|
|
|
if (class_index.empty()) {
|
|
|
|
|
|
|
|
*num_classes = folder_paths.size();
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
for (const auto &p : class_index) {
|
|
|
|
|
|
|
|
CHECK_FAIL_RETURN_UNEXPECTED(folder_names.find(p.first) != folder_names.end(),
|
|
|
|
|
|
|
|
"folder: " + p.first + " doesn't exist in " + path + " .");
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
(*num_classes) = class_index.size();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// return here if only num_class is needed
|
|
|
|
|
|
|
|
RETURN_OK_IF_TRUE(num_rows == nullptr);
|
|
|
|
|
|
|
|
while (folder_paths.empty() == false) {
|
|
|
|
|
|
|
|
Path subdir(folder_paths.front());
|
|
|
|
dir_itr = Path::DirIterator::OpenDirectory(&subdir);
|
|
|
|
dir_itr = Path::DirIterator::OpenDirectory(&subdir);
|
|
|
|
while (dir_itr->hasNext()) {
|
|
|
|
while (dir_itr->hasNext()) {
|
|
|
|
if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) {
|
|
|
|
if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) {
|
|
|
|
++row_cnt;
|
|
|
|
++row_cnt;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
foldernames.pop();
|
|
|
|
folder_paths.pop();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
(*num_rows) = (row_cnt / num_dev) + (row_cnt % num_dev == 0 ? 0 : 1);
|
|
|
|
(*num_rows) = row_cnt;
|
|
|
|
return Status::OK();
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -460,9 +469,12 @@ Status ImageFolderOp::GetDatasetSize(int64_t *dataset_size) {
|
|
|
|
*dataset_size = dataset_size_;
|
|
|
|
*dataset_size = dataset_size_;
|
|
|
|
return Status::OK();
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
int64_t sample_size, num_rows, num_classes;
|
|
|
|
int64_t sample_size, num_rows;
|
|
|
|
num_rows = num_rows_;
|
|
|
|
num_rows = num_rows_;
|
|
|
|
if (num_rows_ <= 0) RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, &num_classes));
|
|
|
|
if (num_rows_ <= 0) {
|
|
|
|
|
|
|
|
// GetDatasetSize will not be impacted by class_index_
|
|
|
|
|
|
|
|
RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, nullptr, {}));
|
|
|
|
|
|
|
|
}
|
|
|
|
sample_size = sampler_->GetNumSamples();
|
|
|
|
sample_size = sampler_->GetNumSamples();
|
|
|
|
*dataset_size = sample_size > 0 ? std::min(num_rows, sample_size) : num_rows;
|
|
|
|
*dataset_size = sample_size > 0 ? std::min(num_rows, sample_size) : num_rows;
|
|
|
|
dataset_size_ = *dataset_size;
|
|
|
|
dataset_size_ = *dataset_size;
|
|
|
@ -475,8 +487,7 @@ Status ImageFolderOp::GetNumClasses(int64_t *num_classes) {
|
|
|
|
*num_classes = num_classes_;
|
|
|
|
*num_classes = num_classes_;
|
|
|
|
return Status::OK();
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
int64_t num_rows = num_rows_;
|
|
|
|
RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, nullptr, num_classes, class_index_));
|
|
|
|
RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, num_classes));
|
|
|
|
|
|
|
|
num_classes_ = *num_classes;
|
|
|
|
num_classes_ = *num_classes;
|
|
|
|
return Status::OK();
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|