!13290 Add a base class for Mappable source ops
From: @hfarahat Reviewed-by: Signed-off-by:pull/13290/MERGE
commit
1edbbe56ba
@ -0,0 +1,152 @@
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/engine/datasetops/source/mappable_leaf_op.h"
|
||||
#include <fstream>
|
||||
#include <unordered_set>
|
||||
#include "utils/ms_utils.h"
|
||||
#include "minddata/dataset/core/config_manager.h"
|
||||
#include "minddata/dataset/core/tensor_shape.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
|
||||
#include "minddata/dataset/engine/db_connector.h"
|
||||
#include "minddata/dataset/engine/execution_tree.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
// Constructor. Forwards the worker count, connector queue size and sampler to ParallelOp and starts the
// row and buffer counters at zero.
// The member initializers are written in declaration order (rows_per_buffer_, row_cnt_, buf_cnt_): members are
// always initialized in the order they are declared, so the list should read the same way.
// @param int32_t num_wkrs - number of worker threads
// @param int32_t queue_size - connector queue size
// @param std::shared_ptr<SamplerRT> sampler - sampler that produces the row ids to read
// @param int32_t rows_per_buffer - number of rows (keys) placed in each IOBlock / DataBuffer
MappableLeafOp::MappableLeafOp(int32_t num_wkrs, int32_t queue_size, std::shared_ptr<SamplerRT> sampler,
                               int32_t rows_per_buffer)
    : ParallelOp(num_wkrs, queue_size, std::move(sampler)),
      rows_per_buffer_(rows_per_buffer),
      row_cnt_(0),
      buf_cnt_(0) {}
|
||||
|
||||
// Main logic, Register Queue with TaskGroup, launch all threads and do the functor's work
|
||||
Status MappableLeafOp::operator()() {
|
||||
RETURN_IF_NOT_OK(LaunchThreadsAndInitOp());
|
||||
std::unique_ptr<DataBuffer> sampler_buffer;
|
||||
RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
|
||||
while (true) { // each iterator is 1 epoch
|
||||
std::vector<int64_t> keys;
|
||||
keys.reserve(rows_per_buffer_);
|
||||
while (sampler_buffer->eoe() == false) {
|
||||
TensorRow sample_row;
|
||||
RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row));
|
||||
std::shared_ptr<Tensor> sample_ids = sample_row[0];
|
||||
for (auto itr = sample_ids->begin<int64_t>(); itr != sample_ids->end<int64_t>(); ++itr) {
|
||||
if ((*itr) >= num_rows_) continue; // index out of bound, skipping
|
||||
keys.push_back(*itr);
|
||||
row_cnt_++;
|
||||
if (row_cnt_ % rows_per_buffer_ == 0) {
|
||||
RETURN_IF_NOT_OK(
|
||||
io_block_queues_[buf_cnt_++ % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone)));
|
||||
keys.clear();
|
||||
}
|
||||
}
|
||||
RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
|
||||
}
|
||||
if (keys.empty() == false) {
|
||||
RETURN_IF_NOT_OK(
|
||||
io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone)));
|
||||
}
|
||||
if (IsLastIteration()) {
|
||||
std::unique_ptr<IOBlock> eoe_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe);
|
||||
std::unique_ptr<IOBlock> eof_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof);
|
||||
RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eoe_block)));
|
||||
RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eof_block)));
|
||||
for (int32_t i = 0; i < num_workers_; ++i) {
|
||||
RETURN_IF_NOT_OK(
|
||||
io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone)));
|
||||
}
|
||||
return Status::OK();
|
||||
} else { // not the last repeat.
|
||||
RETURN_IF_NOT_OK(
|
||||
io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe)));
|
||||
}
|
||||
|
||||
if (epoch_sync_flag_) {
|
||||
// If epoch_sync_flag_ is set, then master thread sleeps until all the worker threads have finished their job for
|
||||
// the current epoch.
|
||||
RETURN_IF_NOT_OK(WaitForWorkers());
|
||||
}
|
||||
// If not the last repeat, self-reset and go to loop again.
|
||||
if (!IsLastIteration()) {
|
||||
RETURN_IF_NOT_OK(Reset());
|
||||
RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
|
||||
}
|
||||
UpdateRepeatAndEpochCounter();
|
||||
}
|
||||
}
|
||||
|
||||
// Reset Sampler and wakeup Master thread (functor)
|
||||
Status MappableLeafOp::Reset() {
|
||||
MS_LOG(DEBUG) << Name() << " performing a self-reset.";
|
||||
RETURN_IF_NOT_OK(sampler_->ResetSampler());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// hand shake with Sampler, allow Sampler to call RandomAccessOp's functions to get NumRows
|
||||
Status MappableLeafOp::InitSampler() {
|
||||
RETURN_IF_NOT_OK(sampler_->HandshakeRandomAccessOp(this));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// contains the main logic of pulling a IOBlock from IOBlockQueue, load a buffer and push the buffer to out_connector_
|
||||
// IMPORTANT: 1 IOBlock produces 1 DataBuffer
|
||||
Status MappableLeafOp::WorkerEntry(int32_t worker_id) {
|
||||
TaskManager::FindMe()->Post();
|
||||
int64_t buffer_id = worker_id;
|
||||
std::unique_ptr<IOBlock> io_block;
|
||||
RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block));
|
||||
while (io_block != nullptr) {
|
||||
if (io_block->wait() == true) {
|
||||
// Sync io_block is a signal that master thread wants us to pause and sync with other workers.
|
||||
// The last guy who comes to this sync point should reset the counter and wake up the master thread.
|
||||
if (++num_workers_paused_ == num_workers_) {
|
||||
wait_for_workers_post_.Set();
|
||||
}
|
||||
} else if (io_block->eoe() == true) {
|
||||
RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)));
|
||||
buffer_id = worker_id;
|
||||
} else if (io_block->eof() == true) {
|
||||
RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)));
|
||||
} else {
|
||||
std::vector<int64_t> keys;
|
||||
RETURN_IF_NOT_OK(io_block->GetKeys(&keys));
|
||||
if (keys.empty() == true) return Status::OK(); // empty key is a quit signal for workers
|
||||
std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone);
|
||||
RETURN_IF_NOT_OK(LoadBuffer(keys, &db));
|
||||
RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db)));
|
||||
buffer_id += num_workers_;
|
||||
}
|
||||
RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block));
|
||||
}
|
||||
RETURN_STATUS_UNEXPECTED("Unexpected nullptr received in worker");
|
||||
}
|
||||
|
||||
// Looping over LoadTensorRow to make 1 DataBuffer. 1 function call produces 1 buffer
|
||||
Status MappableLeafOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) {
|
||||
std::unique_ptr<TensorQTable> deq = std::make_unique<TensorQTable>();
|
||||
TensorRow trow;
|
||||
for (const int64_t &key : keys) {
|
||||
RETURN_IF_NOT_OK(this->LoadTensorRow(key, &trow));
|
||||
deq->push_back(std::move(trow));
|
||||
}
|
||||
(*db)->set_tensor_table(std::move(deq));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,110 @@
|
||||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_MAPPABLE_LEAF_OP_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_MAPPABLE_LEAF_OP_H_
|
||||
|
||||
#include <deque>
|
||||
#include <memory>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/engine/data_buffer.h"
|
||||
#include "minddata/dataset/engine/data_schema.h"
|
||||
#include "minddata/dataset/engine/datasetops/parallel_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
|
||||
#ifndef ENABLE_ANDROID
|
||||
#include "minddata/dataset/kernels/image/image_utils.h"
|
||||
#else
|
||||
#include "minddata/dataset/kernels/image/lite_image_utils.h"
|
||||
#endif
|
||||
#include "minddata/dataset/util/path.h"
|
||||
#include "minddata/dataset/util/queue.h"
|
||||
#include "minddata/dataset/util/services.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
#include "minddata/dataset/util/wait_post.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Forward declaration of the Queue class template
template <typename T>
class Queue;

// Shared pointer to a (std::string, int32_t) pair.
// NOTE(review): going by the name this is presumably <image file, label> - confirm against the users of the alias
using ImageLabelPair = std::shared_ptr<std::pair<std::string, int32_t>>;

// Shared pointer to a (std::string, queue of ImageLabelPair) pair.
// NOTE(review): going by the name this is presumably <folder, images found in it> - confirm against the users
using FolderImagesPair = std::shared_ptr<std::pair<std::string, std::queue<ImageLabelPair>>>;
|
||||
|
||||
class MappableLeafOp : public ParallelOp, public RandomAccessOp {
|
||||
public:
|
||||
// Constructor
|
||||
// @param int32_t num_wkrs - Num of workers reading images in parallel
|
||||
// @param int32_t - rows_per_buffer Number of images (rows) in each buffer
|
||||
// @param std::string - dir directory of ImageNetFolder
|
||||
// @param int32_t queue_size - connector queue size
|
||||
// @param std::set<std::string> exts - set of file extensions to read, if empty, read everything under the dir
|
||||
// @param td::unique_ptr<Sampler> sampler - sampler tells the source what to read
|
||||
MappableLeafOp(int32_t num_wkrs, int32_t queue_size, std::shared_ptr<SamplerRT> sampler, int32_t rows_per_buffer);
|
||||
|
||||
// Destructor.
|
||||
~MappableLeafOp() = default;
|
||||
|
||||
// Main Loop of MappableLeaf
|
||||
// Master thread: Fill IOBlockQueue, then goes to sleep
|
||||
// Worker thread: pulls IOBlock from IOBlockQueue, work on it then put buffer to mOutConnector
|
||||
// @return Status The status code returned
|
||||
Status operator()() override;
|
||||
|
||||
// Op name getter
|
||||
// @return Name of the current Op
|
||||
std::string Name() const override { return "MappableLeafPp"; }
|
||||
|
||||
protected:
|
||||
// Initialize Sampler, calls sampler->Init() within
|
||||
// @return Status The status code returned
|
||||
Status InitSampler();
|
||||
|
||||
// // Called first when function is called
|
||||
// // @return
|
||||
virtual Status LaunchThreadsAndInitOp() = 0;
|
||||
|
||||
Status WorkerEntry(int32_t workerId) override;
|
||||
|
||||
// @param const std::vector<int64_t> &keys - keys in ioblock
|
||||
// @param std::unique_ptr<DataBuffer> db
|
||||
// @return Status The status code returned
|
||||
Status LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db);
|
||||
|
||||
// Load a tensor row according to a pair
|
||||
// @param row_id_type row_id - id for this tensor row
|
||||
// @param ImageLabelPair pair - <imagefile,label>
|
||||
// @param TensorRow row - loaded row
|
||||
// @return Status The status code returned
|
||||
virtual Status LoadTensorRow(row_id_type row_id, TensorRow *row) = 0;
|
||||
|
||||
// reset Op
|
||||
// @return Status The status code returned
|
||||
Status Reset() override;
|
||||
|
||||
int32_t rows_per_buffer_;
|
||||
int64_t row_cnt_;
|
||||
int64_t buf_cnt_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_MAPPABLE_LEAF_OP_H_
|
Loading…
Reference in new issue