@@ -37,11 +37,49 @@ namespace {
 int kProgramId = -1;
 }  // namespace
 
+static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
+    const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
+  std::unordered_map<std::string, size_t> ref_cnts;
+  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
+                                            skip_var_list.end());
+
+  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        if (skip_vars.count(name)) continue;
+        auto* var_desc = block.FindVar(name);
+        if (var_desc == nullptr || var_desc->Persistable()) continue;
+        auto type = var_desc->Proto()->type().type();
+        if (type != proto::VarType::LOD_TENSOR &&
+            type != proto::VarType::SELECTED_ROWS &&
+            type != proto::VarType::LOD_TENSOR_ARRAY) {
+          continue;
+        }
+
+        auto it = ref_cnts.find(name);
+        if (it != ref_cnts.end()) {
+          ++it->second;
+        } else {
+          ref_cnts[name] = 1;
+        }
+      }
+    }
+  };
+
+  for (auto op_desc : block.AllOps()) {
+    update_ref_cnts(op_desc, op_desc->Inputs());
+    update_ref_cnts(op_desc, op_desc->Outputs());
+  }
+  return ref_cnts;
+}
+
 ExecutorPrepareContext::ExecutorPrepareContext(
-    const framework::ProgramDesc& prog, size_t block_id)
+    const framework::ProgramDesc& prog, size_t block_id,
+    const std::vector<std::string>& skip_ref_cnt_vars)
     : prog_(prog), block_id_(block_id) {
   if (GetEagerDeletionThreshold() >= 0) {
-    ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
+    ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
+                                                 skip_ref_cnt_vars);
   }
 }
 
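Note on the hunk above: GetNonPersistableReferenceCounts walks every op in the block and counts, per variable name, how many ops list that variable as an input or output. Persistable variables, variables whose type is not LOD_TENSOR, SELECTED_ROWS or LOD_TENSOR_ARRAY, and anything in skip_var_list never enter the map. A minimal standalone sketch of the same counting idea, using plain STL and hypothetical names (OpVars, CountRefs) that are not part of the Paddle API:

#include <cstddef>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Variable names touched by one op (inputs and outputs flattened together).
using OpVars = std::vector<std::string>;

std::unordered_map<std::string, std::size_t> CountRefs(
    const std::vector<OpVars>& ops,
    const std::unordered_set<std::string>& skip) {
  std::unordered_map<std::string, std::size_t> cnts;
  for (const auto& op : ops) {
    for (const auto& name : op) {
      if (skip.count(name)) continue;  // skipped vars never enter the map
      ++cnts[name];                    // operator[] value-initializes to 0
    }
  }
  return cnts;
}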
@@ -49,10 +87,9 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
   VLOG(5) << "destroy ExecutorPrepareContext";
 }
 
-template <typename RefCntMap>
-static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
-                                GarbageCollector<Tensor>* gc,
-                                RefCntMap* ref_cnts) {
+static void DeleteUnusedTensors(
+    const Scope& scope, const OperatorBase* op, GarbageCollector<Tensor>* gc,
+    std::unordered_map<std::string, size_t>* ref_cnts) {
   std::unordered_set<Tensor*> erase_tensors;
 
   auto handler = [&](const VariableNameMap& name_map) {
@@ -60,7 +97,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
       for (auto& name : name_pair.second) {
         auto it = ref_cnts->find(name);
         if (it == ref_cnts->end()) continue;
-        if ((it->second)-- == 1) {
+        if (--(it->second) == 0) {
           auto* var = scope.FindVar(name);
           if (var != nullptr) {
             VLOG(10) << "Erase tensor \'" << name << "\'";
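Note on the hunk above: the old post-decrement test ((it->second)-- == 1) and the new pre-decrement test (--(it->second) == 0) behave the same whenever the count starts at 1 or more; both fire exactly once, on the op that consumes the last tracked reference. A minimal sketch of that decrement-and-erase pattern, with a hypothetical helper name:

#include <cstddef>
#include <string>
#include <unordered_map>

// Returns true only when the final tracked reference to `name` is consumed;
// variables that were never counted are never reported as erasable.
bool ConsumeLastRef(std::unordered_map<std::string, std::size_t>* ref_cnts,
                    const std::string& name) {
  auto it = ref_cnts->find(name);
  if (it == ref_cnts->end()) return false;
  return --(it->second) == 0;
}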
@@ -69,6 +106,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
             } else if (var->IsType<SelectedRows>()) {
               erase_tensors.insert(
                   var->GetMutable<SelectedRows>()->mutable_value());
+            } else if (var->IsType<LoDTensorArray>()) {
+              auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
+              for (auto& t : *lod_tensor_arr) {
+                erase_tensors.insert(&t);
+              }
             }
           }
         }
@@ -351,9 +393,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 }
 
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
-    const ProgramDesc& program, int block_id) {
+    const ProgramDesc& program, int block_id,
+    const std::vector<std::string>& skip_ref_cnt_vars) {
   std::unique_ptr<ExecutorPrepareContext> ctx(
-      new ExecutorPrepareContext(program, block_id));
+      new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
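Note on the hunk above: Prepare now forwards a list of variable names whose reference counts should not be tracked, so those variables are never eagerly deleted while the prepared block runs. A hedged usage sketch; the variable name is made up, and it assumes skip_ref_cnt_vars is declared with a default value in executor.h so that existing call sites keep compiling:

#include "paddle/fluid/framework/executor.h"

// Hypothetical helper: run block 0 while exempting "fetched_var" from eager
// deletion; program, scope and place are assumed to be set up by the caller.
void RunKeepingFetchedVar(const paddle::framework::ProgramDesc& program,
                          paddle::framework::Scope* scope,
                          const paddle::platform::Place& place) {
  paddle::framework::Executor exe(place);
  auto ctx = exe.Prepare(program, /*block_id=*/0,
                         /*skip_ref_cnt_vars=*/{"fetched_var"});
  exe.RunPreparedContext(ctx.get(), scope);
}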
@@ -364,16 +407,28 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
 }
 
 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
-    const ProgramDesc& program, const std::vector<int>& block_ids) {
+    const ProgramDesc& program, const std::vector<int>& block_ids,
+    const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) {
+  PADDLE_ENFORCE(
+      skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
+      "skip_ref_cnt_vars should be either empty or equals to block number %d",
+      block_ids.size());
   std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
+  size_t idx = 0;
   for (auto& bid : block_ids) {
-    auto* ctx = new ExecutorPrepareContext(program, bid);
+    ExecutorPrepareContext* ctx;
+    if (skip_ref_cnt_vars.empty()) {
+      ctx = new ExecutorPrepareContext(program, bid);
+    } else {
+      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
+    }
     PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
     auto& block = program.Block(bid);
     for (auto& op_desc : block.AllOps()) {
       ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
     }
     result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
+    ++idx;
   }
   return result;
 }
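Note on the hunk above: the multi-block overload takes one skip list per block, and the PADDLE_ENFORCE rejects anything other than an empty vector (the old behaviour) or exactly one list per block id. A hedged calling sketch, assuming an Executor exe and a ProgramDesc program are already in scope as in the earlier single-block sketch:

// Block 0 exempts "x" and "y" from eager deletion, block 1 exempts nothing;
// skip_vars.size() must equal block_ids.size() or the enforce above fires.
std::vector<int> block_ids = {0, 1};
std::vector<std::vector<std::string>> skip_vars = {{"x", "y"}, {}};
auto ctxs = exe.Prepare(program, block_ids, skip_vars);  // one context per block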
@@ -392,18 +447,18 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector<Tensor>> gc;
-  // WhileOp would set keep_kids to true,
-  // because WhileGradOp needs the scopes created in WhileOp.
-  // Perhaps, we should not perform eager deletion in WhileOp
-  // The scopes and variables created by WhileOp would be deleted
-  // in WhileGradOp.
-  if (max_memory_size >= 0 && !keep_kids) {
+  if (max_memory_size >= 0) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
-      gc.reset(new DefaultStreamGarbageCollector<Tensor>(
-          boost::get<platform::CUDAPlace>(place_), max_memory_size));
-    } else {
+      if (IsFastEagerDeletionModeEnabled()) {
+        gc.reset(new UnsafeFastGPUGarbageCollector<Tensor>(
+            boost::get<platform::CUDAPlace>(place_), max_memory_size));
+      } else {
+        gc.reset(new DefaultStreamGarbageCollector<Tensor>(
+            boost::get<platform::CUDAPlace>(place_), max_memory_size));
+      }
+    } else if (platform::is_cpu_place(place_)) {
 #endif
       gc.reset(new CPUGarbageCollector<Tensor>(
           boost::get<platform::CPUPlace>(place_), max_memory_size));
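Note on the hunk above: dropping the !keep_kids guard means eager deletion now also runs inside ops that keep their child scopes (the WhileOp case the deleted comment described), presumably relying on skip_ref_cnt_vars to protect variables a later backward pass still needs. The collector choice itself reduces to a small table; a hedged restatement in comment form, not a helper in the patch:

// max_memory_size < 0                      -> no garbage collector at all
// GPU place and fast eager deletion mode   -> UnsafeFastGPUGarbageCollector
// GPU place, fast mode disabled            -> DefaultStreamGarbageCollector
// CPU place                                -> CPUGarbageCollector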
@@ -415,17 +470,14 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   for (auto& op : ctx->ops_) {
     op->Run(*local_scope, place_);
 
-    if (gc != nullptr) {
+    if (gc) {
       DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
                           &(ctx->cur_ref_cnts_));
     }
   }
 
-  if (gc != nullptr) {
-    gc->Wait();
-  } else {
-    platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  }
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  if (gc) gc->Wait();
 
   if (local_scope != scope) {
     scope->DeleteScope(local_scope);