From 096673f67527b0fed1aab1843041b9d929fd0fb5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 29 Nov 2018 13:20:29 +0000 Subject: [PATCH 01/14] refactor eager deletion test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 12 +- .../details/computation_op_handle.cc | 6 +- .../framework/details/computation_op_handle.h | 6 +- .../details/eager_deletion_op_handle.cc | 117 ++++++++++ .../details/eager_deletion_op_handle.h | 64 ++++++ .../framework/details/eager_deletion_pass.cc | 96 ++++++++ .../framework/details/eager_deletion_pass.h | 32 +++ .../details/multi_devices_graph_pass.cc | 6 +- .../details/reference_count_op_handle.h | 138 ------------ .../framework/details/reference_count_pass.cc | 213 +++++------------- .../framework/details/reference_count_pass.h | 5 - .../details/reference_count_pass_helper.h | 49 ++++ .../scope_buffered_ssa_graph_executor.cc | 30 +-- .../scope_buffered_ssa_graph_executor.h | 4 + paddle/fluid/framework/garbage_collector.h | 12 +- paddle/fluid/framework/ir/graph.h | 11 +- paddle/fluid/framework/ir/pass.h | 11 +- paddle/fluid/framework/parallel_executor.cc | 106 ++++++--- paddle/fluid/framework/parallel_executor.h | 24 +- paddle/fluid/platform/CMakeLists.txt | 9 +- .../fluid/platform/stream_callback_manager.cc | 70 ++++++ .../fluid/platform/stream_callback_manager.h | 51 +---- 22 files changed, 631 insertions(+), 441 deletions(-) create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.h create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.h delete mode 100644 paddle/fluid/framework/details/reference_count_op_handle.h create mode 100644 paddle/fluid/framework/details/reference_count_pass_helper.h create mode 100644 paddle/fluid/platform/stream_callback_manager.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fe..8cf97d667d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,10 +33,9 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -if (WITH_GPU) - cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle - all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) -endif() +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) @@ -44,10 +43,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle 
broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) -if (WITH_GPU) - list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) -endif() +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c60..7beb8c8de9 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -20,11 +20,13 @@ namespace paddle { namespace framework { namespace details { ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, - platform::Place place) + platform::Place place, + size_t scope_idx) : OpHandleBase(node), op_(framework::OpRegistry::CreateOp(*node->Op())), scope_(scope), - place_(place) {} + place_(place), + scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 662a91d6b4..601ae4f8c6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -28,7 +28,8 @@ namespace framework { namespace details { struct ComputationOpHandle : public OpHandleBase { public: - ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place); + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, + size_t scope_idx); std::string Name() const override; @@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase { void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; } + size_t GetScopeIdx() const { return scope_idx_; } + protected: void RunImpl() override; @@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase { std::unique_ptr op_; Scope *scope_; platform::Place place_; + size_t scope_idx_; bool is_lock_and_record_event_free_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc new file mode 100644 index 0000000000..cd26203376 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
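The `scope_idx` threaded through `ComputationOpHandle` above is what makes the rest of this series device-agnostic: later passes index per-device state (one garbage collector and one reference-count map per local scope) by this integer instead of pattern-matching on `platform::Place`. A minimal sketch of the idea, with hypothetical names rather than the actual Paddle classes:

    #include <cstddef>
    #include <memory>
    #include <vector>

    // Hypothetical per-device bundle: one garbage collector, one ref-count map.
    struct DeviceState {};

    struct OpSketch {
      explicit OpSketch(size_t scope_idx) : scope_idx_(scope_idx) {}
      size_t GetScopeIdx() const { return scope_idx_; }

     private:
      size_t scope_idx_;  // which local scope / device this op runs in
    };

    // A pass picks per-device state without inspecting the op's place at all:
    DeviceState *StateOf(std::vector<std::unique_ptr<DeviceState>> *states,
                         const OpSketch &op) {
      return (*states)[op.GetScopeIdx()].get();
    }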
+ +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { +namespace details { + +EagerDeletionOpHandle::EagerDeletionOpHandle( + ir::Node *node, const Scope *scope, const platform::Place &place, + const std::vector &var_names, GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + dev_ctx_ = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + if (dynamic_cast *>(gc_)) { + platform::SetDeviceId(boost::get(place).device); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + } +#endif + + for (auto &name : var_names) AddVar(name); +} + +EagerDeletionOpHandle::~EagerDeletionOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + platform::SetDeviceId(gpu_place.device); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif +} + +std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } + +void EagerDeletionOpHandle::AddVar(const std::string &name) { + var_names_.insert(name); +} + +void EagerDeletionOpHandle::RunImpl() { + auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + std::vector tensors; + for (auto &name : var_names_) { + auto it = ref_cnts_->find(name); + if (it == ref_cnts_->end()) { + continue; + } + + auto *var = exec_scope->FindVar(name); + if (var == nullptr) { + continue; + } + + if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()->mutable_value()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + tensors.emplace_back(&t); + } + } + } + } + + if (!tensors.empty()) { + ClearTensors(tensors); + } +} + +void EagerDeletionOpHandle::ClearTensors(const std::vector &tensors) { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = + static_cast *>(gc_)->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { +#endif + gc_->Add(tensors); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h new file mode 100644 index 0000000000..8254f21bdf --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace details { + +class EagerDeletionPass; + +class EagerDeletionOpHandle : public OpHandleBase { + public: + EagerDeletionOpHandle(ir::Node *node, const Scope *scope, + const platform::Place &place, + const std::vector &var_names, + GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts); + + ~EagerDeletionOpHandle(); + + std::string Name() const override; + + protected: + void RunImpl() override; + + private: + void ClearTensors(const std::vector &tensors); + + void AddVar(const std::string &name); + + const Scope *scope_; + std::unordered_set var_names_; + GarbageCollector *gc_; // not own + AtomicReferenceCountMap *ref_cnts_; // not own +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext *dev_ctx_{nullptr}; + cudaEvent_t event_{nullptr}; +#endif + + friend class EagerDeletionPass; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc new file mode 100644 index 0000000000..f877c2881c --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
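`EagerDeletionOpHandle::RunImpl` above hinges on `fetch_sub(1)` returning the value *before* the decrement, so exactly one op observes `1`, releases the last reference, and frees the tensor. A self-contained sketch of that last-reference pattern (not the Paddle code):

    #include <atomic>
    #include <cstdio>

    // fetch_sub returns the value *before* the decrement, so exactly one
    // caller observes 1: that caller released the last reference and owns
    // the cleanup.
    bool ReleaseReference(std::atomic<size_t> *ref_cnt) {
      return ref_cnt->fetch_sub(1) == 1;
    }

    int main() {
      std::atomic<size_t> cnt{3};
      for (int i = 0; i < 3; ++i) {
        if (ReleaseReference(&cnt)) {
          std::printf("last reference released on call %d\n", i + 1);  // i == 2
        }
      }
      return 0;
    }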
+ +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, + ir::Graph *graph) { + auto it = std::find_if( + in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != in->Outputs().end()) { + out->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + in->AddOutput(dep_var); + out->AddInput(dep_var); + } + + // Add leaf node to eager_deletion_node + if (out->Outputs().empty()) { + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + out->AddOutput(dummy_leaf); + } +} + +std::unique_ptr EagerDeletionPass::ApplyImpl( + std::unique_ptr graph) const { + auto &vars = graph->Get(kGraphVars); + + auto &ref_cnts = + Get>(kCurReferenceCount); + auto &last_live_ops = Get>(kLastLiveOpsOfVars); + auto &gcs = Get(kGarbageCollector); + + ref_cnts = std::vector(vars.size()); + + std::unordered_map op_map; + for (auto &var_ops_map : last_live_ops) { + for (auto &var_ops_pair : var_ops_map) { + const std::string &var_name = var_ops_pair.first; + for (ComputationOpHandle *op : var_ops_pair.second) { + auto it = op_map.find(op); + if (it != op_map.end()) { + it->second->AddVar(var_name); + } else { + auto *eager_deletion_node = graph->CreateEmptyNode( + "eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, + gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); + AddDependencyBetween(op, eager_deletion_op, graph.get()); + op_map[op] = eager_deletion_op; + } + } + } + } + VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(eager_deletion_pass, + paddle::framework::details::EagerDeletionPass) + .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h new file mode 100644 index 0000000000..d7a7a9709d --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
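`AddDependencyBetween` in the pass above orders each deletion op after its compute op using a data-free control edge: it reuses an existing dummy output if one exists, otherwise it creates a `DummyVarHandle` owned by the graph (`kGraphDepVars`), and it gives sink deletion ops a dummy leaf output so they stay reachable in the dependency graph. A simplified sketch of such a control edge, using stand-in types rather than Paddle's `OpHandleBase`/`VarHandleBase`:

    #include <vector>

    struct Op;
    struct Var {
      Op *producer{nullptr};
      std::vector<Op *> consumers;
    };
    struct Op {
      std::vector<Var *> inputs, outputs;
    };

    // Order `after` behind `before` with a dummy variable that carries no data.
    Var *AddControlEdge(Op *before, Op *after, std::vector<Var *> *graph_owned) {
      Var *dep = new Var;  // exists only to impose ordering
      dep->producer = before;
      dep->consumers.push_back(after);
      before->outputs.push_back(dep);
      after->inputs.push_back(dep);
      graph_owned->push_back(dep);  // mirrors kGraphDepVars keeping ownership
      return dep;
    }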
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index a36ad25926..97830386e4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -562,7 +562,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), - local_scopes_[dev_id], places_[dev_id])); + local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } @@ -685,8 +685,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get(kGraphOps).emplace_back( - new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); + result->Get(kGraphOps).emplace_back(new ComputationOpHandle( + result->CreateOpNode(node->Op()), s, p, scope_idx)); CreateOpHandleIOs(result, node, scope_idx); } } diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h deleted file mode 100644 index cc4ccfbdfc..0000000000 --- a/paddle/fluid/framework/details/reference_count_op_handle.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
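For contrast with the deleted `ReferenceCountOpHandle` below: the old design kept GPU-only state keyed by CUDA device id, while the new one keeps a plain vector indexed by `scope_idx`, which works for CPU and GPU places alike. Roughly, with stand-in type names:

    #include <cstddef>
    #include <map>
    #include <memory>
    #include <vector>

    struct Gc {};  // stand-in for GarbageCollector<Tensor>

    // Old layout (GPU-only): per CUDA device id.
    using DeviceGcMap = std::map<int, std::unique_ptr<Gc>>;

    // New layout: one slot per local scope, valid for CPU and GPU places.
    using GcList = std::vector<std::unique_ptr<Gc>>;

    Gc *Lookup(GcList *gcs, size_t scope_idx) { return (*gcs)[scope_idx].get(); }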
- -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace framework { -namespace details { - -using ReferenceCountMap = std::unordered_map; -using AtomicReferenceCountMap = - std::unordered_map>; -using DeviceReferenceCountMap = - std::unordered_map>; -using AtomicDeviceReferenceCountMap = - std::unordered_map>; -using DeviceGarbageCollectorMap = - std::unordered_map>>; - -class ReferenceCountOpHandle : public OpHandleBase { - public: - ReferenceCountOpHandle(ir::Node *node, const Scope *scope, - const platform::CUDAPlace &place, - const std::vector &var_names, - GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { - dev_ctx_ = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - if (IsStreamGarabageCollector()) { - platform::SetDeviceId(place.device); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - for (auto &name : var_names) AddVar(name); - } - - ~ReferenceCountOpHandle() { - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); - PADDLE_ENFORCE(cudaEventDestroy(event_)); - } - } - - std::string Name() const override { return "reference_count"; } - - void AddVar(const std::string &name) { - auto it = var_names_.find(name); - if (it != var_names_.end()) - ++(it->second); - else - var_names_[name] = 1; - } - - protected: - void RunImpl() override { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; - for (auto &pair : var_names_) { - auto &name = pair.first; - auto it = ref_cnts_->find(name); - if (it == ref_cnts_->end()) continue; - - auto *var = exec_scope->FindVar(name); - if (var == nullptr) continue; - - if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back(var->GetMutable()); - } - } else if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back( - var->GetMutable()->mutable_value()); - } - } - } - - if (!tensors.empty()) { - ClearTensors(tensors); - } - } - - private: - void ClearTensors(const std::vector &tensors) { - auto *gc = dynamic_cast *>(gc_); - if (gc != nullptr) { - auto compute_stream = dev_ctx_->stream(); - auto callback_stream = gc->stream(); - auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); - }; - gc_->Add(tensors, callback_func); - } else { - gc_->Add(tensors); - } - } - - bool IsStreamGarabageCollector() const { - return dynamic_cast *>(gc_) != nullptr; - } - - const Scope *scope_; - platform::CUDADeviceContext *dev_ctx_; - std::unordered_map var_names_; - GarbageCollector *gc_; // not own - AtomicReferenceCountMap *ref_cnts_; // not own - cudaEvent_t event_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 08783fb5f8..f094c7afa9 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -17,184 +17,96 @@ #include 
#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/reference_count_pass.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace details { -static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) { - std::queue queue; - queue.push(var_in); +static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( + OpHandleBase *op, size_t scope_idx) { + std::queue q; + std::unordered_set visited; + q.push(op); do { - auto *var = queue.front(); - queue.pop(); - for (auto *op : var->PendingOps()) { - auto *compute_op = dynamic_cast(op); - if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { - return compute_op; - } - for (auto *out_var : op->Outputs()) { - queue.push(out_var); + auto *op = q.front(); + q.pop(); + auto *compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) { + return compute_op; + } + for (auto *out_var : op->Outputs()) { + for (auto *pending_op : out_var->PendingOps()) { + if (visited.count(pending_op)) continue; + visited.insert(pending_op); } } - } while (!queue.empty()); + } while (!q.empty()); return nullptr; } -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } -} - std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &ref_cnts = Get(kGlobalReferenceCount); - auto &cur_ref_cnts = Get(kCurReferenceCount); - auto &gcs = Get(kGarbageCollector); - - // It is not easy to find the right reference counts of varaibles in graph - // Step 1: Find all variables in computation ops - // Step 2: Find all variables in non-computation ops which refers to variables - // in computation ops - std::unordered_set names; - std::unordered_map - compute_ref_cnt_map; - - auto get_ref_cnts_from_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - std::vector var_names_in_op; - auto *compute_op = dynamic_cast(op); - if (compute_op == nullptr || - !platform::is_gpu_place(compute_op->GetPlace())) - return var_names_in_op; - auto place = boost::get(compute_op->GetPlace()); - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - if (!platform::is_gpu_place(var_handle->place_) || - boost::get(var_handle->place_) != place) + auto &vars = graph->Get(kGraphVars); + auto &ref_cnts = Get>(kGlobalReferenceCount); + auto &last_live_ops_of_vars = + Get>(kLastLiveOpsOfVars); + + last_live_ops_of_vars = std::vector(vars.size()); + ref_cnts = std::vector(vars.size()); + + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + if (name_var_pair.second.empty()) continue; + auto *last_ver_var = name_var_pair.second.back(); + + VarDesc *var_desc = nullptr; + std::find_if(name_var_pair.second.rbegin(), 
name_var_pair.second.rend(), + [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + + if (var_desc == nullptr || var_desc->Persistable()) { continue; - - VarDesc *var_desc = var_handle->Node()->Var(); - auto var_name = var_handle->Node()->Name(); - - // This is weird but there is really some variables without var_desc - // in computation_op - if (var_desc == nullptr) { - var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name); - if (var_desc == nullptr) continue; } - if (var_desc->Persistable()) continue; auto var_type = var_desc->Proto()->type().type(); if (var_type != proto::VarType::LOD_TENSOR && - var_type != proto::VarType::SELECTED_ROWS) { + var_type != proto::VarType::SELECTED_ROWS && + var_type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - // compute op only runs in one device - if (ref_cnts[place.device]->count(var_name)) - ++(*ref_cnts[place.device])[var_name]; - else - (*ref_cnts[place.device])[var_name] = 1; - - names.insert(var_name); - var_names_in_op.push_back(var_name); - } - return var_names_in_op; - }; - - auto update_ref_cnts_from_non_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - if (dynamic_cast(op) != nullptr) return; - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - auto var_name = var_handle->Node()->Name(); - auto var_place = var_handle->place_; - if (!platform::is_gpu_place(var_place)) continue; - auto place = boost::get(var_place); - if (names.count(var_name) == 0) continue; - if (ref_cnts.count(place.device) && - ref_cnts[place.device]->count(var_name)) { - ++(*ref_cnts[place.device])[var_name]; - - auto *next_compute_op = FindNextComputationOpHandle(var_handle); - if (next_compute_op != nullptr) { - if (compute_ref_cnt_map.count(next_compute_op)) { - compute_ref_cnt_map[next_compute_op]->AddVar(var_name); - VLOG(5) << "Add reference count of " << var_name << " to Operator " - << next_compute_op->Name(); - } else { - // Create new reference_count_op_handle - ir::Node *ref_cnt_node = graph->CreateEmptyNode( - "reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, next_compute_op->GetScope(), place, {var_name}, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[next_compute_op] = ref_cnt_handle; - } + std::unordered_set last_live_op; + auto add_last_live_op = [&](OpHandleBase *op) { + auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); + if (compute_op) { + last_live_op.insert(compute_op); + } + }; + const std::string &var_name = name_var_pair.first; + auto &pending_ops = last_ver_var->PendingOps(); + if (pending_ops.empty()) { + auto *generated_op = last_ver_var->GeneratedOp(); + if (generated_op) { + ref_cnts[i].emplace(var_name, 1); + add_last_live_op(generated_op); + } + } else { + ref_cnts[i].emplace(var_name, pending_ops.size()); + for (auto *pending_op : pending_ops) { + add_last_live_op(pending_op); } } - } - }; - auto all_ops = ir::FilterByNodeWrapper(*graph); - for (auto &op : all_ops) { - auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); - auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs()); - if (in_var_names.empty() && out_var_names.empty()) continue; - in_var_names.insert(in_var_names.end(), out_var_names.begin(), - 
out_var_names.end()); - auto *compute_op = dynamic_cast(op); - auto place = boost::get(compute_op->GetPlace()); - ir::Node *ref_cnt_node = - graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, compute_op->GetScope(), place, in_var_names, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[compute_op] = ref_cnt_handle; - } - - for (auto &op : all_ops) { - update_ref_cnts_from_non_compute_op(op, op->Inputs()); - update_ref_cnts_from_non_compute_op(op, op->Outputs()); - } - - std::vector new_all_ops; - new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size()); - for (auto &op : all_ops) { - new_all_ops.emplace_back(std::move(op)); - auto it = compute_ref_cnt_map.find(new_all_ops.back()); - if (it != compute_ref_cnt_map.end()) { - // Add LeafNode to ReferenceCountOpHandle - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - it->second->AddOutput(dummy_leaf); - new_all_ops.emplace_back(std::move(it->second)); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } } - - all_ops.swap(new_all_ops); return graph; } @@ -205,5 +117,4 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( REGISTER_PASS(reference_count_pass, paddle::framework::details::ReferenceCountPass) .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) - .RequirePassAttr(paddle::framework::details::kGarbageCollector); + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars); diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h index 7081280b06..bcbef02735 100644 --- a/paddle/fluid/framework/details/reference_count_pass.h +++ b/paddle/fluid/framework/details/reference_count_pass.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/framework/details/reference_count_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -22,10 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kGlobalReferenceCount[] = "reference_count"; -constexpr char kCurReferenceCount[] = "current_reference_count"; -constexpr char kGarbageCollector[] = "garbage_collector"; - class ReferenceCountPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h new file mode 100644 index 0000000000..77846f7bdf --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
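The counting rule implemented by the new `ReferenceCountPass` above: a variable's initial reference count is the number of ops that still read its last version, and when nothing reads it, the op that produced it is treated as the single last user. A toy model of that rule (hypothetical types, not the pass itself):

    #include <cstddef>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct VarUse {
      std::string name;
      size_t num_pending_readers;  // consumers of the var's last version
    };

    std::unordered_map<std::string, size_t> InitRefCounts(
        const std::vector<VarUse> &vars) {
      std::unordered_map<std::string, size_t> ref_cnts;
      for (const auto &v : vars) {
        // A var nobody reads still gets count 1: its producer releases it.
        ref_cnts[v.name] = v.num_pending_readers > 0 ? v.num_pending_readers : 1;
      }
      return ref_cnts;
    }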
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ComputationOpHandle;
+
+using ReferenceCountMap = std::unordered_map<std::string, size_t>;
+
+using AtomicReferenceCountMap =
+    std::unordered_map<std::string, std::atomic<size_t>>;
+
+using GarbageCollectorList =
+    std::vector<std::unique_ptr<GarbageCollector<Tensor>>>;
+
+const char kGlobalReferenceCount[] = "reference_count";
+const char kCurReferenceCount[] = "current_reference_count";
+const char kGarbageCollector[] = "garbage_collector";
+
+using LastLiveOpsOfVars =
+    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
+const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index e5b1eaa731..f1bf6542a3 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -18,9 +18,6 @@
 #include <string>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/reference_count_op_handle.h"
-#endif
 
 namespace paddle {
 namespace framework {
@@ -33,7 +30,11 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
       underlying_executor_(std::move(underlying_executor)),
       local_scopes_(std::move(local_scopes)),
       var_infos_(std::move(var_infos)),
-      places_(std::move(places)) {}
+      places_(std::move(places)) {
+  if (Graph().Has(details::kGarbageCollector)) {
+    gc_ = &(Graph().Get<details::GarbageCollectorList>(
+        details::kGarbageCollector));
+  }
+}
 
 FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
@@ -69,27 +70,16 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
 
-#ifdef PADDLE_WITH_CUDA
-  const std::string gc_name = "garbage_collector";
-  DeviceGarbageCollectorMap *gc =
-      Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
-                           : nullptr;
-#endif
-
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
     drop_scope_counter_ = 0;
     // Wait All computational streams
-    for (auto p : places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-#ifdef PADDLE_WITH_CUDA
-      if (gc != nullptr && platform::is_gpu_place(p)) {
-        auto gpu_place = boost::get<platform::CUDAPlace>(p);
-        auto &gc_at_place = gc->at(gpu_place.device);
-        gc_at_place->Wait();
-        gc_at_place->Reset();
+    for (size_t i = 0; i < places_.size(); ++i) {
+      platform::DeviceContextPool::Instance().Get(places_[i])->Wait();
+      if (gc_) {
+        (*gc_)[i]->Wait();
+        (*gc_)[i]->Reset();
       }
-#endif
     }
     for (auto &scope : local_scopes_) {
       auto &local_scope =
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 5e87e0bf50..ce3061d6e6 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -21,9 +21,11 @@
 #include "paddle/fluid/framework/details/var_handle.h"
 
 #include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/place.h"
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -55,6 +57,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope *> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
+
+  GarbageCollectorList *gc_{nullptr};
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 818b3334ea..cbe8f606ef 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -65,7 +65,7 @@ class GarbageCollector {
 
     if (clear_deque != nullptr) {
       callback();
-      ClearCallback([=]() {
+      ClearCallback([clear_deque]() {
         for (auto *obj : *clear_deque) obj->clear();
       });
     }
@@ -109,7 +109,6 @@ class DefaultStreamGarbageCollector : public GarbageCollector<T> {
   }
 
   void Wait() const override {
-    this->dev_ctx_->Wait();
     static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
         ->WaitStreamCallback();
   }
@@ -127,14 +126,14 @@ class StreamGarbageCollector : public GarbageCollector<T> {
   StreamGarbageCollector(const platform::CUDAPlace &place,
                          size_t max_memory_size)
       : GarbageCollector<T>(place, max_memory_size) {
-    PADDLE_ENFORCE(cudaSetDevice(place.device));
+    platform::SetDeviceId(place.device);
     PADDLE_ENFORCE(cudaStreamCreate(&stream_));
     callback_manager_.reset(new platform::StreamCallbackManager(stream_));
  }
 
   ~StreamGarbageCollector() {
     auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
-    PADDLE_ENFORCE(cudaSetDevice(place.device));
+    platform::SetDeviceId(place.device);
     PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
     PADDLE_ENFORCE(cudaStreamDestroy(stream_));
   }
@@ -148,8 +147,11 @@ class StreamGarbageCollector : public GarbageCollector<T> {
   cudaStream_t stream() const { return stream_; }
 
  protected:
+  // ClearCallback and Wait()/Reset() cannot be called concurrently from
+  // multiple threads. This is acceptable, because neither Executor nor
+  // ParallelExecutor invokes them from more than one thread, so no
+  // extra locking is needed here.
   void ClearCallback(const std::function<void()> &callback) override {
-    std::lock_guard<std::mutex> guard(this->mutex_);
     callback_manager_->AddCallback(callback);
   }
 
diff --git a/paddle/fluid/framework/ir/graph.h
b/paddle/fluid/framework/ir/graph.h
index 947c934f0f..7a2560c14d 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -73,14 +73,21 @@ class Graph {
   }
 
   bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
   }
 
   template <typename AttrType>
   AttrType &Get(const std::string &attr_name) const {
     PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
                    attr_name);
-    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    try {
+      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    } catch (boost::bad_any_cast &) {
+      PADDLE_THROW(
+          "Invalid attribute type of %s, expected: %s, actual: %s",
+          attr_name, typeid(AttrType *).name(),
+          attrs_.at(attr_name).type().name());
+    }
   }
 
   template <typename AttrType>
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index a3559247db..27746ff145 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -51,11 +51,18 @@ class Pass {
   AttrType &Get(const std::string &attr_name) const {
     PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
                    "%s attr not registered for pass.", attr_name);
-    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    try {
+      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    } catch (boost::bad_any_cast &) {
+      PADDLE_THROW(
+          "Invalid attribute type of %s, expected: %s, actual: %s",
+          attr_name, typeid(AttrType *).name(),
+          attrs_.at(attr_name).type().name());
+    }
   }
 
   bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
   }
 
   void Erase(const std::string &attr_name) {
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b98408ee77..e71f93beef 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -26,6 +26,7 @@ limitations under the License.
*/ #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -49,6 +50,15 @@ class ParallelExecutorPrivate { } } } + + void ResetRuntimeReferenceCount() { + for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { + for (auto &pair : rt_ref_cnts_[i]) { + rt_cur_ref_cnts_[i][pair.first] = pair.second; + } + } + } + std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -60,6 +70,13 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + + // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then + // keeps unchanged + // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ + std::vector rt_ref_cnts_; + std::vector rt_cur_ref_cnts_; + details::GarbageCollectorList gcs_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -128,35 +145,56 @@ ParallelExecutor::ParallelExecutor( std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); +#else + std::unique_ptr graph = + build_strategy.Apply(main_program, member_->places_, loss_var_name, + params, member_->local_scopes_, member_->use_cuda_); +#endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - for (auto &place : member_->places_) { - if (!platform::is_gpu_place(place)) continue; - auto gpu_place = boost::get(place); - if (gcs_[gpu_place.device] == nullptr) { - ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap()); - cur_ref_cnts_[gpu_place.device].reset( - new details::AtomicReferenceCountMap()); - gcs_[gpu_place.device].reset( - new StreamGarbageCollector(gpu_place, max_memory_size)); + size_t place_num = member_->places_.size(); + for (size_t i = 0; i < place_num; ++i) { + auto &place = member_->places_[i]; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + member_->gcs_.emplace_back(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + member_->gcs_.emplace_back(new CPUGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA } - } - if (!gcs_.empty()) { - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = ref_cnt_pass->Apply(std::move(graph)); - graph->SetNotOwned("garbage_collector", &gcs_); +#endif } } -#else - std::unique_ptr graph = - build_strategy.Apply(main_program, member_->places_, loss_var_name, - params, member_->local_scopes_, member_->use_cuda_); -#endif + + if (!member_->gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &(member_->rt_ref_cnts_)); + 
ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + VLOG(10) << "ReferenceCountPass Applied"; + graph = ref_cnt_pass->Apply(std::move(graph)); + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, + &(member_->rt_cur_ref_cnts_)); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, + &(member_->gcs_)); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars @@ -271,18 +309,16 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); -#ifdef PADDLE_WITH_CUDA - if (!gcs_.empty()) { - ResetReferenceCount(); - for (auto &pair : cur_ref_cnts_) { - auto &name_map = *(pair.second); + if (!member_->gcs_.empty()) { + member_->ResetRuntimeReferenceCount(); + size_t n = member_->rt_ref_cnts_.size(); + for (size_t i = 0; i < n; ++i) { for (auto &fetch_name : fetch_tensors) { - name_map.erase(fetch_name); + member_->rt_cur_ref_cnts_[i].erase(fetch_name); } - name_map.erase(fetched_var_name); + member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); } } -#endif auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; @@ -326,13 +362,11 @@ ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - // member_ must be destructed before gcs_ since the destructor of - // ReferenceCountOpHandle use raw pointers of gcs_ inside. - member_.reset(); + delete member_; } } // namespace framework } // namespace paddle -#ifdef PADDLE_WITH_CUDA + USE_PASS(reference_count_pass); -#endif +USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ef09b98b2a..1fc17a0d64 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include #include #include @@ -29,10 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/reference_count_pass.h" -#endif - namespace paddle { namespace framework { @@ -75,24 +70,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - std::unique_ptr member_; - -#ifdef PADDLE_WITH_CUDA - // ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_ - details::DeviceReferenceCountMap ref_cnts_; - details::AtomicDeviceReferenceCountMap cur_ref_cnts_; - details::DeviceGarbageCollectorMap gcs_; - - void ResetReferenceCount() { - for (auto &pair1 : ref_cnts_) { - for (auto &pair2 : *(pair1.second)) { - (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second; - } - } - } -#endif + ParallelExecutorPrivate *member_; }; } // namespace framework diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 93cb5eb2dc..23c7ebe842 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -56,9 +56,16 @@ ELSE() set(MKLDNN_CTX_DEPS) ENDIF() +nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +IF(WITH_GPU) + set(STREAM_CALLBACK_DEPS stream_callback_manager) +ELSE() + set(STREAM_CALLBACK_DEPS) +ENDIF() + # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc +cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc new file mode 100644 index 0000000000..ae915365f8 --- /dev/null +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
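The file that follows exists because of a CUDA constraint: a stream callback runs on a CUDA-internal thread that must not call CUDA APIs, and long work there stalls the stream, which is why `StreamCallbackManager` only enqueues the real work into a single-thread `ThreadPool`. A minimal standalone illustration of the callback contract (not the Paddle class):

    #include <cuda_runtime.h>
    #include <atomic>
    #include <cstdio>

    static std::atomic<bool> g_stream_done{false};

    // Runs on a CUDA-internal thread once all prior work in the stream has
    // finished. CUDA forbids calling CUDA APIs from inside this callback, and
    // heavy work here stalls the stream, so only signal and let another
    // thread do the real cleanup.
    static void CUDART_CB OnStreamDone(cudaStream_t /*stream*/,
                                       cudaError_t status, void * /*user_data*/) {
      if (status == cudaSuccess) g_stream_done.store(true);
    }

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      cudaStreamAddCallback(stream, OnStreamDone, nullptr, 0);
      cudaStreamSynchronize(stream);  // callback has fired once this returns
      std::printf("stream done: %s\n", g_stream_done.load() ? "yes" : "no");
      cudaStreamDestroy(stream);
      return 0;
    }

As the diff shows, CUDA 10 replaces the deprecated `cudaStreamAddCallback` with `cudaLaunchHostFunc`, which takes a plain `void (*)(void *)` host function; the same no-CUDA-calls restriction applies.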
+ +#include "paddle/fluid/platform/stream_callback_manager.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +struct StreamCallbackContext { + inline StreamCallbackContext(const StreamCallbackManager *manager, + std::function callback) + : manager_(manager), callback_(std::move(callback)) {} + + const StreamCallbackManager *manager_; // do not own + std::function callback_; +}; + +StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) + : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} + +void StreamCallbackManager::AddCallback(std::function callback) const { + auto *stream_callback_context = + new StreamCallbackContext(this, std::move(callback)); +#if CUDA_VERSION >= 10000 + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, + StreamCallbackManager::StreamCallbackFunc, + stream_callback_context)); +#else + PADDLE_ENFORCE( + cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); +#endif +} + +void StreamCallbackManager::Wait() const { + thread_pool_.reset(new ::ThreadPool(1)); +} + +#if CUDA_VERSION >= 10000 +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data) +#else +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, + void *user_data) +#endif +{ + auto *callback_context_ptr = + reinterpret_cast(user_data); + callback_context_ptr->manager_->thread_pool_->enqueue( + [callback_context_ptr]() { + std::unique_ptr callback_context( + callback_context_ptr); + callback_context->callback_(); + }); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index ed8734c98c..eac4806d13 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -19,66 +19,29 @@ #include #include #include -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -class StreamCallbackManager; - -struct StreamCallbackContext { - template - inline StreamCallbackContext(const StreamCallbackManager *manager, - Callback &&callback) - : manager_(manager), callback_(callback) {} - - const StreamCallbackManager *manager_; // do not own - std::function callback_; -}; - +// NOTE(zjl): clean StreamCallback to make compilation faster class StreamCallbackManager { public: - explicit inline StreamCallbackManager(cudaStream_t stream = nullptr) - : stream_(stream), thread_pool_(new ThreadPool(1)) {} + explicit StreamCallbackManager(const cudaStream_t stream); - template - inline void AddCallback(Callback &&callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::forward(callback)); -#if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); // NOLINT -#else - PADDLE_ENFORCE(cudaStreamAddCallback( - stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); // NOLINT -#endif - } + void AddCallback(std::function callback) const; - void Wait() const { thread_pool_.reset(new ThreadPool(1)); } + void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr thread_pool_; + mutable std::unique_ptr<::ThreadPool> thread_pool_; -// cudaStreamCallback cannot call CUDA API inside, so we have to use -// thread_pool here #if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data) + static void 
CUDART_CB StreamCallbackFunc(void *user_data); #else static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data) + cudaError_t status, void *user_data); #endif - { - auto *callback_context_ptr = - reinterpret_cast(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue([=]() { - std::unique_ptr callback_context( - callback_context_ptr); - callback_context->callback_(); - }); - } }; } // namespace platform From c47c451a007f33078bfb8f38be4a6cd50922f361 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 11:45:53 +0000 Subject: [PATCH 02/14] fix bug --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/computation_op_handle.cc | 2 + .../details/eager_deletion_op_handle.cc | 23 ++-- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 81 ++++++------ .../fluid/framework/details/op_graph_view.h | 29 +++- .../framework/details/reference_count_pass.cc | 125 ++++++++++++++++-- .../scope_buffered_ssa_graph_executor.cc | 21 ++- .../scope_buffered_ssa_graph_executor.h | 2 + paddle/fluid/framework/executor.cc | 104 +++++++++++---- paddle/fluid/framework/executor.h | 51 ++----- paddle/fluid/framework/garbage_collector.h | 44 +++--- paddle/fluid/framework/operator.cc | 2 + paddle/fluid/framework/parallel_executor.cc | 13 +- paddle/fluid/framework/scope.cc | 6 + paddle/fluid/framework/scope.h | 1 + paddle/fluid/framework/tensor.h | 2 +- .../fluid/operators/controlflow/while_op.cc | 44 +++++- paddle/fluid/operators/reader/ctr_reader.h | 12 +- paddle/fluid/platform/device_context.h | 10 +- .../fluid/platform/stream_callback_manager.cc | 67 +++++----- .../fluid/platform/stream_callback_manager.h | 20 +-- paddle/fluid/pybind/tensor_py.h | 12 +- python/paddle/fluid/__init__.py | 5 +- 24 files changed, 458 insertions(+), 228 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8cf97d667d..8049f5d3f7 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -35,7 +35,7 @@ cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_e cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7beb8c8de9..2bf43fd4e0 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,6 +31,8 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); + VLOG(10) << "Run Op" << Name(); + auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc 
b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index cd26203376..41f616035d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/cuda_device_guard.h" namespace paddle { namespace framework { @@ -23,28 +24,32 @@ namespace details { EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, const Scope *scope, const platform::Place &place, - const std::vector &var_names, GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { + const std::unordered_set &var_names, + GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), + scope_(scope), + var_names_(var_names), + gc_(gc), + ref_cnts_(ref_cnts) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { dev_ctx_ = static_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast *>(gc_)) { - platform::SetDeviceId(boost::get(place).device); + platform::CUDADeviceGuard guard( + boost::get(place).device); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + PADDLE_ENFORCE_NOT_NULL(event_); } } #endif - - for (auto &name : var_names) AddVar(name); } EagerDeletionOpHandle::~EagerDeletionOpHandle() { #ifdef PADDLE_WITH_CUDA if (event_) { auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); + platform::CUDADeviceGuard guard(gpu_place.device); PADDLE_ENFORCE(cudaEventDestroy(event_)); } #endif @@ -52,10 +57,6 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } -void EagerDeletionOpHandle::AddVar(const std::string &name) { - var_names_.insert(name); -} - void EagerDeletionOpHandle::RunImpl() { auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); std::vector tensors; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 8254f21bdf..d8de59cc4d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -25,13 +25,11 @@ class Scope; namespace details { -class EagerDeletionPass; - class EagerDeletionOpHandle : public OpHandleBase { public: EagerDeletionOpHandle(ir::Node *node, const Scope *scope, const platform::Place &place, - const std::vector &var_names, + const std::unordered_set &var_names, GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts); @@ -45,8 +43,6 @@ class EagerDeletionOpHandle : public OpHandleBase { private: void ClearTensors(const std::vector &tensors); - void AddVar(const std::string &name); - const Scope *scope_; std::unordered_set var_names_; GarbageCollector *gc_; // not own @@ -55,8 +51,6 @@ class EagerDeletionOpHandle : public OpHandleBase { platform::CUDADeviceContext *dev_ctx_{nullptr}; cudaEvent_t event_{nullptr}; #endif - - friend class EagerDeletionPass; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index f877c2881c..3a1b37e533 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -26,62 +26,61 @@ namespace paddle { namespace 
framework { namespace details { -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } - - // Add leaf node to eager_deletion_node - if (out->Outputs().empty()) { - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - out->AddOutput(dummy_leaf); - } -} - std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); + const auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kCurReferenceCount); - auto &last_live_ops = Get>(kLastLiveOpsOfVars); + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); auto &gcs = Get(kGarbageCollector); ref_cnts = std::vector(vars.size()); - std::unordered_map op_map; + std::unordered_map> + op_vars_map; + for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; - for (ComputationOpHandle *op : var_ops_pair.second) { - auto it = op_map.find(op); - if (it != op_map.end()) { - it->second->AddVar(var_name); - } else { - auto *eager_deletion_node = graph->CreateEmptyNode( - "eager_deletion", ir::Node::Type::kOperation); - auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, - gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); - AddDependencyBetween(op, eager_deletion_op, graph.get()); - op_map[op] = eager_deletion_op; - } + for (auto *op : var_ops_pair.second) { + op_vars_map[op].insert(var_name); } } } - VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + + for (auto &pair : op_vars_map) { + auto *op = pair.first; + auto &var_names = pair.second; + + auto *eager_deletion_node = + graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), + std::move(var_names), gcs[op->GetScopeIdx()].get(), + &(ref_cnts[op->GetScopeIdx()])); + + auto it = std::find_if( + op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != op->Outputs().end()) { + eager_deletion_op->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + op->AddOutput(dep_var); + eager_deletion_op->AddInput(dep_var); + } + + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + eager_deletion_op->AddOutput(dummy_leaf); + } + + VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; return graph; } diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h index afb3e8e594..77aa02eba5 100644 --- a/paddle/fluid/framework/details/op_graph_view.h +++ b/paddle/fluid/framework/details/op_graph_view.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include #include @@ -34,6 +34,11 @@ class OpGraphView { bool HasOp(OpHandleBase *op) const; + // Use a visitor to visit all pending ops of 
op + // Stop when callback returns false + template + bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const; + private: void Build(const std::vector &ops); void EnforceHasOp(OpHandleBase *op) const; @@ -44,6 +49,28 @@ class OpGraphView { pending_ops_; }; +template +bool OpGraphView::VisitAllPendingOps(OpHandleBase *op, + Callback &&callback) const { + EnforceHasOp(op); + std::unordered_set visited; + std::queue q; + q.push(op); + do { + op = q.front(); + q.pop(); + for (auto &pending_op : pending_ops_.at(op)) { + if (visited.count(pending_op) == 0) { + visited.insert(pending_op); + if (!callback(pending_op)) { + return false; + } + } + } + } while (!q.empty()); + return true; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f094c7afa9..2320d3926a 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -14,11 +14,13 @@ #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/op_graph_view.h" #include "paddle/fluid/framework/details/reference_count_pass.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -27,6 +29,89 @@ namespace paddle { namespace framework { namespace details { +struct OpConnectionDetector { + public: + enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; + + explicit OpConnectionDetector(const std::vector &all_ops) + : graph_(all_ops) {} + + template + std::unordered_set MaxNoDepOps( + const OpSet &op_set) { + using KeyType = typename OpSet::key_type; + static_assert( + std::is_base_of::type>::value, + "Key type of OpSet must be or derived of OpHandleBase"); + + std::vector ops(op_set.begin(), op_set.end()); + std::unordered_set ret; + auto rels = GetRelations(ops); + auto not_before = [](RelationShip r) { return r != kBefore; }; + for (size_t i = 0; i < rels.size(); ++i) { + if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) { + ret.insert(static_cast(ops[i])); + } + } + return ret; + } + + private: + std::vector> GetRelations( + const std::vector ops) { + std::unordered_map op_to_idx; + for (size_t i = 0; i < ops.size(); ++i) { + PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); + op_to_idx[ops[i]] = i; + } + + PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops"); + + std::vector> ret(ops.size()); + for (auto &e : ret) { + e.assign(ops.size(), kSame); + } + + size_t found_num = ops.size(); + size_t total_num = ops.size() * ops.size(); + auto visitor = [&](OpHandleBase *op, size_t i) { + auto it = op_to_idx.find(op); + if (it != op_to_idx.end()) { + size_t j = it->second; + if (ret[i][j] != kSame) { + ret[i][j] = kBefore; + ret[j][i] = kAfter; + found_num += 2; + if (found_num == total_num) { + return false; + } + } + } + return true; + }; + + for (size_t i = 0; i < ops.size(); ++i) { + auto sub_visitor = [&, i](OpHandleBase *op) { return visitor(op, i); }; + if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) { + break; + } + } + + for (size_t i = 0; i < ops.size(); ++i) { + for (size_t j = i + 1; j < ops.size(); ++j) { + if (ret[i][j] != kSame) continue; + ret[i][j] = 
kNoDeps; + ret[j][i] = kNoDeps; + } + } + + return ret; + } + + const OpGraphView graph_; +}; + static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -59,9 +144,15 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( last_live_ops_of_vars = std::vector(vars.size()); ref_cnts = std::vector(vars.size()); + OpConnectionDetector detector(ir::FilterByNodeWrapper(*graph)); + for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) continue; + if (name_var_pair.second.empty()) { + continue; + } + + const std::string &var_name = name_var_pair.first; auto *last_ver_var = name_var_pair.second.back(); VarDesc *var_desc = nullptr; @@ -83,30 +174,46 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( } std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) { + auto add_last_live_op = [&](OpHandleBase *op) -> bool { auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); if (compute_op) { last_live_op.insert(compute_op); + return true; + } else { + return false; } }; - const std::string &var_name = name_var_pair.first; + + bool can_delete = false; auto &pending_ops = last_ver_var->PendingOps(); if (pending_ops.empty()) { auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op) { - ref_cnts[i].emplace(var_name, 1); - add_last_live_op(generated_op); + if (generated_op && add_last_live_op(generated_op)) { + can_delete = true; } } else { - ref_cnts[i].emplace(var_name, pending_ops.size()); + can_delete = true; for (auto *pending_op : pending_ops) { - add_last_live_op(pending_op); + if (!add_last_live_op(pending_op)) { + can_delete = false; + break; + } } } - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (can_delete) { + size_t original_size = last_live_op.size(); + last_live_op = detector.MaxNoDepOps(last_live_op); + if (last_live_op.size() != original_size) { + VLOG(10) << "Shrink last living op number of " << var_name << " from " + << original_size << " to " << last_live_op.size(); + } + ref_cnts[i].emplace(var_name, last_live_op.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + } } } + return graph; } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index f1bf6542a3..0cc3ac8bfb 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -36,6 +36,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( } } +void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { + if (gc_) { + for (auto &gc : *gc_) { + gc->Wait(); + gc->Reset(); + } + } +} + FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { @@ -74,19 +83,19 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (size_t i = 0; i < places_.size(); ++i) { - platform::DeviceContextPool::Instance().Get(places_[i])->Wait(); - if (gc_) { - (*gc_)[i]->Wait(); - (*gc_)[i]->Reset(); - } + for (auto &p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); } + WaitAllGarbageCollectors(); for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); 
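// Note (annotation, not part of the patch): in the Run() loop above, all device
// streams and then all garbage collectors are drained (WaitAllGarbageCollectors)
// before any local exec scope is deleted below, so no pending deallocation
// callback can touch a variable whose scope has already been destroyed. On
// iterations that keep their scopes, the else-branch drains only the collectors.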
scope->DeleteScope(local_scope); } + } else { + WaitAllGarbageCollectors(); } + if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index ce3061d6e6..4d52183a20 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -50,6 +50,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; private: + void WaitAllGarbageCollectors(); + size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 96132a2c18..02d1e4114e 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -37,11 +37,49 @@ namespace { int kProgramId = -1; } // namespace +static std::unordered_map GetNonPersistableReferenceCounts( + const BlockDesc& block, const std::vector& skip_var_list) { + std::unordered_map ref_cnts; + std::unordered_set skip_vars(skip_var_list.begin(), + skip_var_list.end()); + + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + if (skip_vars.count(name)) continue; + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS && + type != proto::VarType::LOD_TENSOR_ARRAY) { + continue; + } + + auto it = ref_cnts.find(name); + if (it != ref_cnts.end()) { + ++it->second; + } else { + ref_cnts[name] = 1; + } + } + } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); + } + return ref_cnts; +} + ExecutorPrepareContext::ExecutorPrepareContext( - const framework::ProgramDesc& prog, size_t block_id) + const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCount(prog_, block_id_); + ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -49,10 +87,9 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } -template -static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, - GarbageCollector* gc, - RefCntMap* ref_cnts) { +static void DeleteUnusedTensors( + const Scope& scope, const OperatorBase* op, GarbageCollector* gc, + std::unordered_map* ref_cnts) { std::unordered_set erase_tensors; auto handler = [&](const VariableNameMap& name_map) { @@ -60,7 +97,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, for (auto& name : name_pair.second) { auto it = ref_cnts->find(name); if (it == ref_cnts->end()) continue; - if ((it->second)-- == 1) { + if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { VLOG(10) << "Erase tensor \'" << name << "\'"; @@ -69,6 +106,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, } else if (var->IsType()) { erase_tensors.insert( var->GetMutable()->mutable_value()); + } else if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + for (auto& 
t : *lod_tensor_arr) { + erase_tensors.insert(&t); + } } } } @@ -351,9 +393,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } std::unique_ptr Executor::Prepare( - const ProgramDesc& program, int block_id) { + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars) { std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id)); + new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -364,16 +407,28 @@ std::unique_ptr Executor::Prepare( } std::vector> Executor::Prepare( - const ProgramDesc& program, const std::vector& block_ids) { + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars) { + PADDLE_ENFORCE( + skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), + "skip_ref_cnt_vars should be either empty or equals to block number %d", + block_ids.size()); std::vector> result; + size_t idx = 0; for (auto& bid : block_ids) { - auto* ctx = new ExecutorPrepareContext(program, bid); + ExecutorPrepareContext* ctx; + if (skip_ref_cnt_vars.empty()) { + ctx = new ExecutorPrepareContext(program, bid); + } else { + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } result.push_back(std::shared_ptr(ctx)); + ++idx; } return result; } @@ -392,18 +447,18 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr> gc; - // WhileOp would set keep_kids to true, - // because WhileGradOp needs the scopes created in WhileOp. - // Perhaps, we should not perform eager deletion in WhileOp - // The scopes and variables created by WhileOp would be deleted - // in WhileGradOp. 
- if (max_memory_size >= 0 && !keep_kids) { + if (max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { - gc.reset(new DefaultStreamGarbageCollector( - boost::get(place_), max_memory_size)); - } else { + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place_), max_memory_size)); + } else { + gc.reset(new DefaultStreamGarbageCollector( + boost::get(place_), max_memory_size)); + } + } else if (platform::is_cpu_place(place_)) { #endif gc.reset(new CPUGarbageCollector( boost::get(place_), max_memory_size)); @@ -415,17 +470,14 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); - if (gc != nullptr) { + if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), &(ctx->cur_ref_cnts_)); } } - if (gc != nullptr) { - gc->Wait(); - } else { - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - } + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + if (gc) gc->Wait(); if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 36b36d49c2..f00d4314b6 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -28,42 +28,11 @@ namespace paddle { namespace framework { extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); -template -std::unordered_map GetNonPersistableReferenceCount( - const ProgramDesc& prog, size_t block_id) { - auto& block = prog.Block(block_id); - std::unordered_map ref_cnts; - - auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - auto* var_desc = block.FindVar(name); - if (var_desc == nullptr || var_desc->Persistable()) continue; - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR && - type != proto::VarType::SELECTED_ROWS) { - continue; - } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } - } - } - }; - - for (auto op_desc : block.AllOps()) { - update_ref_cnts(op_desc, op_desc->Inputs()); - update_ref_cnts(op_desc, op_desc->Outputs()); - } - return ref_cnts; -} - struct ExecutorPrepareContext { - ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); + ~ExecutorPrepareContext(); void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } @@ -72,8 +41,8 @@ struct ExecutorPrepareContext { size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map ref_cnts_; + std::unordered_map cur_ref_cnts_; }; class Executor { @@ -109,10 +78,14 @@ class Executor { const std::string& fetch_holder_name = "fetch"); static std::unique_ptr Prepare( - const ProgramDesc& program, int block_id); + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); static std::vector> Prepare( - const ProgramDesc& program, const std::vector& block_ids); + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars = + std::vector>()); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git 
a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index cbe8f606ef..1382e0d461 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -19,6 +19,9 @@ #include #include #include // NOLINT +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -36,6 +39,11 @@ class GarbageCollector { virtual ~GarbageCollector() {} + size_t NumOfGarbages() const { + std::lock_guard guard(mutex_); + return garbages_->size(); + } + void Reset() { std::lock_guard guard(mutex_); garbages_.reset(new std::deque()); @@ -49,7 +57,7 @@ class GarbageCollector { template void Add(const Container &objs, Callback &&callback) { - std::shared_ptr> clear_deque; + std::deque *clear_deque = nullptr; { std::lock_guard guard(mutex_); for (auto *obj : objs) { @@ -58,7 +66,7 @@ class GarbageCollector { } if (cur_memory_size_ >= max_memory_size_) { cur_memory_size_ = 0; - clear_deque = garbages_; + clear_deque = garbages_.release(); garbages_.reset(new std::deque()); } } @@ -67,6 +75,7 @@ class GarbageCollector { callback(); ClearCallback([clear_deque]() { for (auto *obj : *clear_deque) obj->clear(); + delete clear_deque; }); } } @@ -77,7 +86,7 @@ class GarbageCollector { virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::shared_ptr> garbages_; + std::unique_ptr> garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; size_t cur_memory_size_ = 0; @@ -96,6 +105,19 @@ class CPUGarbageCollector : public GarbageCollector { }; #ifdef PADDLE_WITH_CUDA +template +class UnsafeFastGPUGarbageCollector : public GarbageCollector { + public: + UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + + protected: + void ClearCallback(const std::function &callback) override { + callback(); + } +}; + template class DefaultStreamGarbageCollector : public GarbageCollector { public: @@ -109,7 +131,7 @@ class DefaultStreamGarbageCollector : public GarbageCollector { } void Wait() const override { - static_cast(this->dev_ctx_) + static_cast(this->dev_ctx_) ->WaitStreamCallback(); } @@ -126,31 +148,23 @@ class StreamGarbageCollector : public GarbageCollector { StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - platform::SetDeviceId(place.device); + platform::CUDADeviceGuard guard(place.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } ~StreamGarbageCollector() { auto place = boost::get(this->dev_ctx_->GetPlace()); - platform::SetDeviceId(place.device); + platform::CUDADeviceGuard guard(place.device); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); } - void Wait() const override { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - std::lock_guard guard(this->mutex_); - callback_manager_->Wait(); - } + void Wait() const override { callback_manager_->Wait(); } cudaStream_t stream() const { return stream_; } protected: - // ClearCallback and Wait()/Reset() cannot be call in multiple threads - // But it is not important, because they would not be called in multiple - // threads - // either in Executor or ParallelExecutor void ClearCallback(const std::function &callback) override { callback_manager_->AddCallback(callback); } 
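For readers following the garbage_collector.h hunks above: Add() batches garbage by accumulated size instead of freeing each tensor immediately, and hands a full batch to the virtual ClearCallback (synchronous for the CPU and unsafe-fast GPU collectors, deferred onto a stream callback for StreamGarbageCollector). A minimal standalone sketch of that pattern; the names BatchedCollector and Garbage are invented for illustration, and the release loop runs inline here rather than through a callback:

#include <cstddef>
#include <deque>
#include <functional>
#include <memory>
#include <mutex>

struct Garbage {
  size_t size;
  std::function<void()> clear;
};

class BatchedCollector {
 public:
  explicit BatchedCollector(size_t max_memory_size)
      : max_memory_size_(max_memory_size),
        garbages_(new std::deque<Garbage>()) {}

  // Collect under the lock; release the filled batch outside of it, exactly
  // like the release()/reset() handoff in GarbageCollector::Add() above.
  void Add(Garbage g) {
    std::deque<Garbage> *full_batch = nullptr;
    {
      std::lock_guard<std::mutex> guard(mutex_);
      cur_memory_size_ += g.size;
      garbages_->push_back(std::move(g));
      if (cur_memory_size_ >= max_memory_size_) {
        cur_memory_size_ = 0;
        full_batch = garbages_.release();
        garbages_.reset(new std::deque<Garbage>());
      }
    }
    if (full_batch != nullptr) {
      for (auto &obj : *full_batch) obj.clear();
      delete full_batch;
    }
  }

 private:
  const size_t max_memory_size_;
  size_t cur_memory_size_{0};
  std::unique_ptr<std::deque<Garbage>> garbages_;
  std::mutex mutex_;
};

The raw-pointer handoff mirrors the patch's switch from a shared_ptr deque to unique_ptr plus release(): a filled batch has exactly one owner and is cleared without holding the lock.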
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8bfdf38912..a5f714fc89 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -873,6 +873,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get<SelectedRows>().value()); } if (t != nullptr) { + PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", + ipt_name, DebugString()); int tmp = static_cast<int>(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e71f93beef..3d466e44a1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -158,8 +158,13 @@ ParallelExecutor::ParallelExecutor( auto &place = member_->places_[i]; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { - member_->gcs_.emplace_back(new StreamGarbageCollector<Tensor>( - boost::get<platform::CUDAPlace>(place), max_memory_size)); + if (IsFastEagerDeletionModeEnabled()) { + member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector<Tensor>( + boost::get<platform::CUDAPlace>(place), max_memory_size)); + } else { + member_->gcs_.emplace_back(new StreamGarbageCollector<Tensor>( + boost::get<platform::CUDAPlace>(place), max_memory_size)); + } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; } else if (platform::is_cpu_place(place)) { #endif @@ -181,8 +186,8 @@ ParallelExecutor::ParallelExecutor( &(member_->rt_ref_cnts_)); ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, &last_live_ops_of_vars); - VLOG(10) << "ReferenceCountPass Applied"; graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; auto eager_deletion_pass = ir::PassRegistry::Instance().Get("eager_deletion_pass"); @@ -194,6 +199,8 @@ ParallelExecutor::ParallelExecutor( &last_live_ops_of_vars); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); } // Step 3. Create vars in each scope. Passes may also create new vars. diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7cc..cb3b6cdc3e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,6 +38,10 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clears tensors. " "Disabled when this value is less than 0."); +DEFINE_bool(fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory is released " + "immediately, without waiting for the GPU kernel to finish."); + // When in inference scenario, the scopes will not be written by two threads at // the same time, but a scope may be read by multiple threads concurrently, and // the mutex will cause serious performance issues.
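Taken together with the executor.cc hunk earlier in this patch, the garbage-collector choice reduces to a small decision table. A standalone sketch of that policy; the enum and helper below are illustrative, not Paddle APIs:

#include <cstdint>

enum class GCKind { kNone, kCPU, kUnsafeFastGPU, kStreamGPU };

// FLAGS_eager_delete_tensor_gb < 0 disables eager deletion entirely; on GPU
// places, FLAGS_fast_eager_deletion_mode selects the collector that frees
// memory immediately instead of waiting on the compute stream.
GCKind SelectGC(bool is_gpu_place, bool fast_mode, int64_t max_memory_size) {
  if (max_memory_size < 0) return GCKind::kNone;
  if (!is_gpu_place) return GCKind::kCPU;
  return fast_mode ? GCKind::kUnsafeFastGPU : GCKind::kStreamGPU;
}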
@@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() { (static_cast<int64_t>(1) << 30)); } +bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } + Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 1901ffbe57..aded1f771c 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -27,6 +27,7 @@ namespace paddle { namespace framework { int64_t GetEagerDeletionThreshold(); +bool IsFastEagerDeletionModeEnabled(); class Scope; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b..3a4c52410e 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_ = nullptr; } + void clear() { holder_.reset(); } const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 6c1b2f329a..d8410b4058 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -59,7 +59,21 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr<bool>("is_test"); - auto ctx = executor.Prepare(*program, block->ID()); + auto &skip_eager_deletion_vars = + Attr<std::vector<std::string>>("skip_eager_deletion_vars"); + if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { + std::string debug_string = + "Skip " + std::to_string(skip_eager_deletion_vars.size()) + + " vars in eager deletion mode: "; + for (auto &var : skip_eager_deletion_vars) { + debug_string.append(var); + debug_string.push_back(' '); + } + VLOG(10) << debug_string; + } + + auto ctx = + executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); while (cond.data<bool>()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -96,6 +110,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); + AddAttr<std::vector<std::string>>("skip_eager_deletion_vars", "Vars that would skip eager deletion. " "Users should not set this manually.") .SetDefault(std::vector<std::string>()); AddComment(R"DOC( )DOC"); } @@ -341,6 +359,30 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed.
while_grad->SetAttr("original_output_grad", output_grads_list); + /* The following codes are used in eager deletion mode */ + if (framework::GetEagerDeletionThreshold() >= 0) { + std::unordered_set skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + // If input var of ops inside grad_block is not from grad_block, + // it cannot be deleted when forward while_op runs + if (in_arg_name != framework::kEmptyVarName && + !grad_block->HasVar(in_arg_name)) { + skip_vars.insert(in_arg_name); + } + } + } + + if (!skip_vars.empty()) { + // FIXME(zjl): ugly const_cast here, maybe we should find a better way + // to modify forward while_op + auto &fwd_while_op = const_cast(ForwardOp()); + fwd_while_op.SetAttr( + "skip_eager_deletion_vars", + std::vector(skip_vars.begin(), skip_vars.end())); + } + } + return std::unique_ptr(while_grad); } }; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae1..7fc07efe73 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -16,6 +16,7 @@ #include +#include #include // NOLINT #include #include @@ -55,8 +56,7 @@ class CTRReader : public framework::FileReader { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = - file_list_.size() > thread_num ? thread_num : file_list_.size(); + thread_num_ = std::min(file_list_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -95,10 +95,10 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { - read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, - thread_id, &read_thread_status_, queue_))); + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + read_threads_.emplace_back(new std::thread(std::bind( + &ReadThread, file_groups_[thread_id], slots_, batch_size_, + static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( std::bind(&MonitorThread, &read_thread_status_, queue_))); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3edd727978..37453a8c29 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -223,14 +223,10 @@ class CUDADeviceContext : public DeviceContext { template void AddStreamCallback(Callback&& callback) const { - std::lock_guard guard(callback_mtx_); callback_manager_->AddCallback(callback); } - void WaitStreamCallback() const { - std::lock_guard guard(callback_mtx_); - callback_manager_->Wait(); - } + void WaitStreamCallback() const { callback_manager_->Wait(); } #if CUDA_VERSION >= 9000 /*! 
\brief CublasCall may need to change cublas's config, @@ -261,9 +257,7 @@ class CUDADeviceContext : public DeviceContext { mutable std::mutex mtx_; - // This lock is only used by callback - // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes - mutable std::mutex callback_mtx_; + // StreamCallbackManager is thread-safe std::unique_ptr<StreamCallbackManager> callback_manager_; mutable std::mutex cublas_mtx_; diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index ae915365f8..58ec6f2f5d 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -18,52 +18,47 @@ namespace paddle { namespace platform { -struct StreamCallbackContext { - inline StreamCallbackContext(const StreamCallbackManager *manager, - std::function<void()> callback) - : manager_(manager), callback_(std::move(callback)) {} - - const StreamCallbackManager *manager_; // do not own - std::function<void()> callback_; -}; +#if CUDA_VERSION >= 10000 +static void CUDART_CB StreamCallbackFunc(void *user_data) +#else +static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, void *user_data) +#endif +{ + std::unique_ptr<std::function<void()>> func( + reinterpret_cast<std::function<void()> *>(user_data)); + (*func)(); +} StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) - : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} + : stream_(stream), thread_pool_(1) {} void StreamCallbackManager::AddCallback(std::function<void()> callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::move(callback)); + auto *callback_func = new std::function<void()>(std::move(callback)); + auto *func = new std::function<void()>([this, callback_func] { + std::lock_guard<std::mutex> lock(mtx_); + last_future_ = thread_pool_.enqueue([callback_func] { + std::unique_ptr<std::function<void()>> releaser(callback_func); + (*callback_func)(); + }); + }); #if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func)); #else - PADDLE_ENFORCE( - cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); + PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif } -void StreamCallbackManager::Wait() const { - thread_pool_.reset(new ::ThreadPool(1)); -} +StreamCallbackManager::~StreamCallbackManager() { Wait(); } -#if CUDA_VERSION >= 10000 -void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data) -#else -void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, - void *user_data) -#endif -{ - auto *callback_context_ptr = - reinterpret_cast<StreamCallbackContext *>(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue( - [callback_context_ptr]() { - std::unique_ptr<StreamCallbackContext> callback_context( - callback_context_ptr); - callback_context->callback_(); - }); +void StreamCallbackManager::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + { + std::lock_guard<std::mutex> lock(mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } } } // namespace platform diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index eac4806d13..0d5d85bf46 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -18,30 +18,32 @@ #include <cuda.h> #include <cuda_runtime.h> #include <functional> +#include <future> // NOLINT #include <memory> +#include <mutex> // NOLINT + +#include
"paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -// NOTE(zjl): clean StreamCallback to make compilation faster +// NOTE(zjl): clean StreamCallbackManager to make compilation faster +// Make StreamCallbackManager thread-safe class StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); + ~StreamCallbackManager(); + void AddCallback(std::function<void()> callback) const; void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr<::ThreadPool> thread_pool_; -#if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data); -#else - static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data); -#endif + mutable ::ThreadPool thread_pool_; + mutable std::mutex mtx_; + mutable std::future<void> last_future_; }; } // namespace platform diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 02a75236f6..24800e1709 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -162,7 +162,7 @@ void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -200,7 +200,7 @@ void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -221,7 +221,7 @@ inline void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -240,7 +240,7 @@ void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -260,7 +260,7 @@ inline void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f7fefb3e5b..2690149e9b 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,8 +116,9 @@ def __bootstrap__(): 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", - 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', + 'allocator_strategy', 'reader_queue_speed_test_mode', + 'print_sub_graph_dir' ] if 'Darwin' not in
sysstr: read_env_flags.append('use_pinned_memory') From 35a2578426840642acc0b2100be0b1c96c2cf1e9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 13:21:49 +0000 Subject: [PATCH 03/14] fix bug test=develop --- .../framework/details/computation_op_handle.cc | 2 -- .../framework/details/reference_count_pass.cc | 14 +++++++++----- paddle/fluid/platform/stream_callback_manager.cc | 2 -- paddle/fluid/platform/stream_callback_manager.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 2bf43fd4e0..7beb8c8de9 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,8 +31,6 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); - VLOG(10) << "Run Op" << Name(); - auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 2320d3926a..0c096e0980 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,7 +29,7 @@ namespace paddle { namespace framework { namespace details { -struct OpConnectionDetector { +class OpConnectionDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; @@ -37,8 +37,8 @@ struct OpConnectionDetector { : graph_(all_ops) {} template - std::unordered_set MaxNoDepOps( - const OpSet &op_set) { + OpSet MaxNoDepOps(const OpSet &op_set) { + if (op_set.size() <= 1) return op_set; using KeyType = typename OpSet::key_type; static_assert( std::is_base_of ops(op_set.begin(), op_set.end()); - std::unordered_set ret; + OpSet ret; auto rels = GetRelations(ops); auto not_before = [](RelationShip r) { return r != kBefore; }; for (size_t i = 0; i < rels.size(); ++i) { @@ -79,7 +79,7 @@ struct OpConnectionDetector { auto it = op_to_idx.find(op); if (it != op_to_idx.end()) { size_t j = it->second; - if (ret[i][j] != kSame) { + if (i != j && ret[i][j] == kSame) { ret[i][j] = kBefore; ret[j][i] = kAfter; found_num += 2; @@ -208,6 +208,10 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( VLOG(10) << "Shrink last living op number of " << var_name << " from " << original_size << " to " << last_live_op.size(); } + + PADDLE_ENFORCE(!last_live_op.empty(), + "Last living ops of %s cannot be empty", var_name); + ref_cnts[i].emplace(var_name, last_live_op.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 58ec6f2f5d..466c77469e 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -49,8 +49,6 @@ void StreamCallbackManager::AddCallback(std::function callback) const { #endif } -StreamCallbackManager::~StreamCallbackManager() { Wait(); } - void StreamCallbackManager::Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); { diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 0d5d85bf46..8668bcb113 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -33,7 +33,7 @@ class 
StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); - ~StreamCallbackManager(); + ~StreamCallbackManager() = default; void AddCallback(std::function<void()> callback) const; From 2d0d037d8e9e1580d38e800fd1a0d0b0056422eb Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 04/14] fix while_op eager deletion bug add unittest test=develop --- paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 5 files changed, 140 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034..f443c2d8cf 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType<LoDTensor>()) { erase_tensors.insert(var->GetMutable<LoDTensor>()); } else if (var->IsType<SelectedRows>()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b4058..da7cad82d8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector<std::string> &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr<bool>("is_test"); - auto &skip_eager_deletion_vars = - Attr<std::vector<std::string>>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data<bool>()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training.
Some layers may run faster when this is true.") .SetDefault(false); - AddAttr<std::vector<std::string>>("skip_eager_deletion_vars", + AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars, "Vars that would skip eager deletion. " "Users should not set this manually.") .SetDefault(std::vector<std::string>()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The following code is used in eager deletion mode */ + std::unordered_set<std::string> bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set<std::string> skip_vars; + std::unordered_set<std::string> fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast<framework::OpDesc &>(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector<std::string>(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector<std::string>(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector<std::string>(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr<framework::OpDesc>(while_grad); } }; diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 0000000000..7ec1f0ae75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 0000000000..2dcdbdb8f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 0000000000..754d5fd409 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() From e694d0c2e487a854103e0cc4796f92af6d27ccfd Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 05/14] fix while_op eager deletion bug add unittest test=develop --- .../details/eager_deletion_op_handle.cc | 2 + paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 6 files changed, 142 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 41f616035d..54715fed8d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,9 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034..f443c2d8cf 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b4058..da7cad82d8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto &skip_eager_deletion_vars = - Attr>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : 
skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data<bool>()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); - AddAttr<std::vector<std::string>>("skip_eager_deletion_vars", + AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars, "Vars that would skip eager deletion. " "Users should not set this manually.") .SetDefault(std::vector<std::string>()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The following code is used in eager deletion mode */ + std::unordered_set<std::string> bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set<std::string> skip_vars; + std::unordered_set<std::string> fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast<framework::OpDesc &>(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector<std::string>(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector<std::string>(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector<std::string>(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr<framework::OpDesc>(while_grad); } diff --git
a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 0000000000..7ec1f0ae75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 0000000000..2dcdbdb8f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 0000000000..754d5fd409 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
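(An aside on the pattern shared by every test file added in this series, not part of the patch itself: the FLAGS_eager_delete_tensor_gb assignment must land in os.environ before anything from paddle is imported, because fluid reads its flags from the environment when the core library loads. Note also that "0.0" does not mean disabled. A minimal sketch of the idea:

import os

# Must be set before the first paddle import anywhere in the process;
# only a negative value disables eager deletion. "0.0" means a zero-GB
# buffering threshold, i.e. release every unused tensor as soon as its
# reference count reaches zero.
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"

import paddle.fluid  # noqa: E402  (deliberately imported after the env var)

The test below, like the MNIST one above, simply inherits an existing test class so that its whole suite reruns with eager deletion enabled.)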
+ +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() From d0c8b9b9b350f774a7b195bf6c807b90b5f895f9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 12:00:28 +0000 Subject: [PATCH 06/14] remove timeout unittest test=develop --- paddle/fluid/framework/tensor.h | 2 +- .../test_eager_deletion_seresnext.py | 27 ------------------- 2 files changed, 1 insertion(+), 28 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 3a4c52410e..71e8badd4b 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_.reset(); } + void clear() { holder_ = nullptr; } const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py deleted file mode 100644 index 2dcdbdb8f1..0000000000 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import unittest -os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" - -from test_parallel_executor_seresnext import TestResnet - - -class EagerDeletionTestSEResNext(TestResnet): - pass - - -if __name__ == '__main__': - unittest.main() From 387bac46b5e4d95e2888773975d1b6c3a906a588 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:09:43 +0000 Subject: [PATCH 07/14] refine code test=develop --- .../framework/details/eager_deletion_pass.cc | 10 +- .../fluid/framework/details/op_graph_view.cc | 2 + .../framework/details/reference_count_pass.cc | 14 +- .../details/reference_count_pass_helper.h | 10 +- .../scope_buffered_ssa_graph_executor.cc | 8 +- .../scope_buffered_ssa_graph_executor.h | 2 +- paddle/fluid/framework/executor.cc | 14 +- paddle/fluid/framework/executor.h | 6 +- paddle/fluid/framework/parallel_executor.cc | 153 ++++++++++-------- .../fluid/operators/controlflow/while_op.cc | 10 +- 10 files changed, 122 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 3a1b37e533..85991c71e6 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -31,10 +31,11 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( const auto &vars = graph->Get(kGraphVars); auto &ref_cnts = - Get>(kCurReferenceCount); + Get>(kRuntimeReferenceCount); const auto &last_live_ops = Get>(kLastLiveOpsOfVars); - auto &gcs = Get(kGarbageCollector); + auto &gcs = Get(kGarbageCollector); + const auto &places = Get>(kAllPlaces); ref_cnts = std::vector(vars.size()); @@ -58,7 +59,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); auto *eager_deletion_op = new EagerDeletionOpHandle( eager_deletion_node, op->GetScope(), op->GetPlace(), - std::move(var_names), gcs[op->GetScopeIdx()].get(), + std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(), &(ref_cnts[op->GetScopeIdx()])); auto it = std::find_if( @@ -90,6 +91,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( REGISTER_PASS(eager_deletion_pass, paddle::framework::details::EagerDeletionPass) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount) .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index 4838c4198f..b6b5ad42c4 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -23,6 +23,8 @@ namespace details { OpGraphView::OpGraphView(const std::vector &ops) { Build(ops); } void OpGraphView::Build(const std::vector &ops) { + preceding_ops_.clear(); + pending_ops_.clear(); for (auto &op : ops) { preceding_ops_[op]; pending_ops_[op]; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 0c096e0980..f2c9dfb524 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,22 +29,22 @@ namespace paddle { namespace framework { namespace details { -class OpConnectionDetector { +class OpRelationDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, 
kAfter = 3 }; - explicit OpConnectionDetector(const std::vector &all_ops) + explicit OpRelationDetector(const std::vector &all_ops) : graph_(all_ops) {} template - OpSet MaxNoDepOps(const OpSet &op_set) { - if (op_set.size() <= 1) return op_set; + OpSet MaxNoDepOps(const OpSet &op_set) const { using KeyType = typename OpSet::key_type; static_assert( std::is_base_of::type>::value, - "Key type of OpSet must be or derived of OpHandleBase"); + "Key type of OpSet must be OpHandleBase, or derived of OpHandleBase"); + if (op_set.size() <= 1) return op_set; std::vector ops(op_set.begin(), op_set.end()); OpSet ret; auto rels = GetRelations(ops); @@ -59,7 +59,7 @@ class OpConnectionDetector { private: std::vector> GetRelations( - const std::vector ops) { + const std::vector ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); @@ -144,7 +144,7 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( last_live_ops_of_vars = std::vector(vars.size()); ref_cnts = std::vector(vars.size()); - OpConnectionDetector detector(ir::FilterByNodeWrapper(*graph)); + OpRelationDetector detector(ir::FilterByNodeWrapper(*graph)); for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index 77846f7bdf..eb534f9701 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -33,12 +34,13 @@ using ReferenceCountMap = std::unordered_map; using AtomicReferenceCountMap = std::unordered_map>; -using GarbageCollectorList = - std::vector>>; +using GarbageCollectorMap = + std::map>>; -const char kGlobalReferenceCount[] = "reference_count"; -const char kCurReferenceCount[] = "current_reference_count"; +const char kGlobalReferenceCount[] = "global_reference_count"; +const char kRuntimeReferenceCount[] = "runtime_reference_count"; const char kGarbageCollector[] = "garbage_collector"; +const char kAllPlaces[] = "all_places"; using LastLiveOpsOfVars = std::unordered_map>; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index da5e277f27..b8775fc329 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -32,15 +32,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( var_infos_(std::move(var_infos)), places_(std::move(places)) { if (Graph().Has(details::kGarbageCollector)) { - gc_ = &(Graph().Get(details::kGarbageCollector)); + gc_ = &(Graph().Get(details::kGarbageCollector)); } } void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { if (gc_) { - for (auto &gc : *gc_) { - gc->Wait(); - gc->Reset(); + for (auto &gc_pair : *gc_) { + gc_pair.second->Wait(); + gc_pair.second->Reset(); } } } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 4d52183a20..6086a219e0 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -60,7 +60,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { 
std::vector var_infos_; std::vector places_; - GarbageCollectorList* gc_{nullptr}; + GarbageCollectorMap* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index f443c2d8cf..04425a5983 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -56,13 +56,7 @@ static std::unordered_map GetNonPersistableReferenceCounts( type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } + ++ref_cnts[name]; } } }; @@ -79,8 +73,8 @@ ExecutorPrepareContext::ExecutorPrepareContext( const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -443,7 +437,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), - &(ctx->cur_ref_cnts_)); + &(ctx->runtime_ref_cnts_)); } } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 412ebd1904..5a040ac641 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -34,14 +34,14 @@ struct ExecutorPrepareContext { ~ExecutorPrepareContext(); - void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } + void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; } const framework::ProgramDesc& prog_; size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map global_ref_cnts_; + std::unordered_map runtime_ref_cnts_; }; class Executor { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3d466e44a1..dfd031f119 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -51,11 +51,22 @@ class ParallelExecutorPrivate { } } - void ResetRuntimeReferenceCount() { - for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { - for (auto &pair : rt_ref_cnts_[i]) { - rt_cur_ref_cnts_[i][pair.first] = pair.second; + std::unique_ptr PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size); + + inline bool HasGarbageCollectors() const { return !gcs_.empty(); } + + void ResetRuntimeReferenceCount(const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + for (size_t i = 0; i < runtime_ref_cnts_.size(); ++i) { + for (auto &pair : global_ref_cnts_[i]) { + runtime_ref_cnts_[i][pair.first] = pair.second; + } + + for (auto &fetch_name : fetch_tensors) { + runtime_ref_cnts_[i].erase(fetch_name); } + runtime_ref_cnts_[i].erase(fetched_var_name); } } @@ -71,14 +82,75 @@ class ParallelExecutorPrivate { bool use_cuda_; bool use_all_reduce_; - // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ - std::vector rt_ref_cnts_; - std::vector rt_cur_ref_cnts_; - details::GarbageCollectorList gcs_; + // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and + // then keeps unchanged + // Before each iteration, runtime_ref_cnts_ is reset to global_ref_cnts_ + std::vector global_ref_cnts_; + std::vector runtime_ref_cnts_; + 
details::GarbageCollectorMap gcs_; }; +std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size) { + for (size_t i = 0; i < places_.size(); ++i) { + auto &place = places_[i]; + if (gcs_.count(place) > 0) { + continue; + } +#ifdef PADDLE_WITH_CUDA + GarbageCollector *gc = nullptr; + if (platform::is_gpu_place(place)) { + if (IsFastEagerDeletionModeEnabled()) { + gc = new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size); + } else { + gc = new StreamGarbageCollector( + boost::get(place), max_memory_size); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + gc = new CPUGarbageCollector( + boost::get(place), max_memory_size); + VLOG(10) << "Created GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA + } +#endif + + if (gc) { + gcs_[place] = std::unique_ptr>(gc); + } + } + + if (gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &global_ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kRuntimeReferenceCount, + &runtime_ref_cnts_); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &gcs_); + } + + return graph; +} + std::vector &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } @@ -153,54 +225,8 @@ ParallelExecutor::ParallelExecutor( auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - size_t place_num = member_->places_.size(); - for (size_t i = 0; i < place_num; ++i) { - auto &place = member_->places_[i]; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place)) { - if (IsFastEagerDeletionModeEnabled()) { - member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector( - boost::get(place), max_memory_size)); - } else { - member_->gcs_.emplace_back(new StreamGarbageCollector( - boost::get(place), max_memory_size)); - } - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; - } else if (platform::is_cpu_place(place)) { -#endif - member_->gcs_.emplace_back(new CPUGarbageCollector( - boost::get(place), max_memory_size)); - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#ifdef PADDLE_WITH_CUDA - } -#endif - } - } - - if (!member_->gcs_.empty()) { - std::vector last_live_ops_of_vars; - - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, - &(member_->rt_ref_cnts_)); - ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = ref_cnt_pass->Apply(std::move(graph)); - VLOG(10) << "ReferenceCountPass Applied"; - - auto eager_deletion_pass = - ir::PassRegistry::Instance().Get("eager_deletion_pass"); - eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, - &(member_->rt_cur_ref_cnts_)); - 
eager_deletion_pass->SetNotOwned(details::kGarbageCollector, - &(member_->gcs_)); - eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = eager_deletion_pass->Apply(std::move(graph)); - VLOG(10) << "EagerDeletionPass Applied"; - - graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); + graph = member_->PrepareGCAndRefCnts(std::move(graph), + static_cast(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. @@ -316,15 +342,8 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); - if (!member_->gcs_.empty()) { - member_->ResetRuntimeReferenceCount(); - size_t n = member_->rt_ref_cnts_.size(); - for (size_t i = 0; i < n; ++i) { - for (auto &fetch_name : fetch_tensors) { - member_->rt_cur_ref_cnts_[i].erase(fetch_name); - } - member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); - } + if (member_->HasGarbageCollectors()) { + member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name); } auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index da7cad82d8..06920a47ee 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -74,9 +74,7 @@ class WhileOp : public framework::OperatorBase { bool is_test = Attr("is_test"); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { @@ -144,9 +142,7 @@ class WhileGradOp : public framework::OperatorBase { auto *program = block->Program(); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = @@ -369,7 +365,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. 
while_grad->SetAttr("original_output_grad", output_grads_list); - /* The followi_ng codes are used in eager deletion mode */ + /* The following codes are used in eager deletion mode */ std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { std::unordered_set fwd_skip_vars; From 644baa2e45b64f5a52e237ca1981cb30a5043e0c Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 08/14] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f119..fd2bcb8848 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( From 8095fb5e686d3e32f1838dfe7fbf4d0108ef1f25 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 09/14] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f119..e51b1f1f73 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( @@ -122,7 +122,7 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( } } - if (gcs_.empty()) { + if (!gcs_.empty()) { std::vector last_live_ops_of_vars; auto ref_cnt_pass = From eb8252466b11bdbea7abca6fd4cc5816f1c30830 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 09:15:23 +0000 Subject: [PATCH 10/14] polish code add unittest model containing while_op remove unnecessary codes test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/details/CMakeLists.txt | 5 +- .../details/eager_deletion_op_handle.cc | 48 +++--- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 18 +- .../fluid/framework/details/op_graph_view.cc | 1 + .../framework/details/reference_count_pass.cc | 156 +++++++++++------- .../details/reference_count_pass_helper.cc | 21 +++ .../details/reference_count_pass_helper.h | 4 +- .../scope_buffered_ssa_graph_executor.cc | 21 +-- .../scope_buffered_ssa_graph_executor.h | 6 - paddle/fluid/framework/executor.cc | 56 ++++--- paddle/fluid/framework/garbage_collector.cc | 89 ++++++++++ paddle/fluid/framework/garbage_collector.h | 153 ++++++----------- paddle/fluid/framework/parallel_executor.cc | 28 ++-- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/tensor.h | 4 + .../unittests/test_eager_deletion_gru_net.py | 49 ++++++ .../unittests/test_eager_deletion_lstm_net.py | 111 +++++++++++++ 19 files changed, 516 insertions(+), 268 deletions(-) create mode 100644 
paddle/fluid/framework/details/reference_count_pass_helper.cc create mode 100644 paddle/fluid/framework/garbage_collector.cc create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c701a2ad63..f2361c5cea 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) +cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory) + cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) @@ -164,7 +166,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8049f5d3f7..a6c8ef408a 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,9 +33,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 
54715fed8d..3b27415e43 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -26,8 +26,8 @@ namespace details {
 EagerDeletionOpHandle::EagerDeletionOpHandle(
     ir::Node *node, const Scope *scope, const platform::Place &place,
-    const std::unordered_set<std::string> &var_names,
-    GarbageCollector<Tensor> *gc, AtomicReferenceCountMap *ref_cnts)
+    const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
+    AtomicReferenceCountMap *ref_cnts)
     : OpHandleBase(node),
       scope_(scope),
       var_names_(var_names),
@@ -35,9 +35,9 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
       ref_cnts_(ref_cnts) {
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place)) {
-    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
+    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
         platform::DeviceContextPool::Instance().Get(place));
-    if (dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_)) {
+    if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(
           boost::get<platform::CUDAPlace>(place).device);
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
@@ -61,10 +61,11 @@ std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
 
 void EagerDeletionOpHandle::RunImpl() {
   auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  std::vector<Tensor *> tensors;
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
   for (auto &name : var_names_) {
     auto it = ref_cnts_->find(name);
-    if (it == ref_cnts_->end()) {
+    // Var not found, or reference count has not decreased to 0
+    if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
       continue;
     }
@@ -73,43 +74,44 @@ void EagerDeletionOpHandle::RunImpl() {
       continue;
     }
 
+    VLOG(2) << "Erase variable " << name;
+
     if (var->IsType<LoDTensor>()) {
-      if (it->second.fetch_sub(1) == 1) {
-        tensors.emplace_back(var->GetMutable<LoDTensor>());
-      }
+      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
     } else if (var->IsType<SelectedRows>()) {
-      if (it->second.fetch_sub(1) == 1) {
-        tensors.emplace_back(var->GetMutable<SelectedRows>()->mutable_value());
-      }
+      garbages.emplace_back(
+          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
     } else if (var->IsType<LoDTensorArray>()) {
-      if (it->second.fetch_sub(1) == 1) {
-        auto *tensor_arr = var->GetMutable<LoDTensorArray>();
-        for (auto &t : *tensor_arr) {
-          tensors.emplace_back(&t);
-        }
+      auto *tensor_arr = var->GetMutable<LoDTensorArray>();
+      for (auto &t : *tensor_arr) {
+        garbages.emplace_back(t.MoveMemory());
       }
+    } else {
+      PADDLE_THROW("Type %s of %s is not supported for eager deletion",
+                   var->Type().name(), name);
     }
   }
 
-  if (!tensors.empty()) {
-    ClearTensors(tensors);
+  if (!garbages.empty()) {
+    ClearGarbages(&garbages);
   }
 }
 
-void EagerDeletionOpHandle::ClearTensors(const std::vector<Tensor *> &tensors) {
+void EagerDeletionOpHandle::ClearGarbages(
+    std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
 #ifdef PADDLE_WITH_CUDA
   if (event_) {
     auto compute_stream = dev_ctx_->stream();
     auto callback_stream =
-        static_cast<StreamGarbageCollector<Tensor> *>(gc_)->stream();
+        reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
     auto callback_func = [=]() {
       PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
       PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
     };
-    gc_->Add(tensors, callback_func);
+    gc_->Add(std::move(*garbages), callback_func);
   } else {
 #endif
-    gc_->Add(tensors);
+    gc_->Add(std::move(*garbages));
 #ifdef PADDLE_WITH_CUDA
   }
 #endif
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
index d8de59cc4d..64867afad5 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -14,8 +14,8 @@
 #pragma once
 
+#include <deque>
 #include <memory>
-#include <vector>
 #include
"paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" @@ -30,7 +30,7 @@ class EagerDeletionOpHandle : public OpHandleBase { EagerDeletionOpHandle(ir::Node *node, const Scope *scope, const platform::Place &place, const std::unordered_set &var_names, - GarbageCollector *gc, + GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts); ~EagerDeletionOpHandle(); @@ -41,11 +41,11 @@ class EagerDeletionOpHandle : public OpHandleBase { void RunImpl() override; private: - void ClearTensors(const std::vector &tensors); + void ClearGarbages(std::deque> *garbages); const Scope *scope_; std::unordered_set var_names_; - GarbageCollector *gc_; // not own + GarbageCollector *gc_; // not own AtomicReferenceCountMap *ref_cnts_; // not own #ifdef PADDLE_WITH_CUDA platform::CUDADeviceContext *dev_ctx_{nullptr}; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 85991c71e6..4e42d0b497 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -28,17 +28,21 @@ namespace details { std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - const auto &vars = graph->Get(kGraphVars); - auto &ref_cnts = Get>(kRuntimeReferenceCount); + PADDLE_ENFORCE(ref_cnts.empty(), + "kRuntimeReferenceCount should be initialized here!"); + + const auto &vars = graph->Get(kGraphVars); + ref_cnts.resize(vars.size()); + const auto &last_live_ops = Get>(kLastLiveOpsOfVars); - auto &gcs = Get(kGarbageCollector); + const auto &gcs = Get(kGarbageCollector); const auto &places = Get>(kAllPlaces); - ref_cnts = std::vector(vars.size()); - + // a reverse map of last_live_ops + // i.e., last op --> variable names which can be deleted. 
std::unordered_map> op_vars_map; @@ -58,8 +62,8 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( auto *eager_deletion_node = graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), - std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(), + eager_deletion_node, op->GetScope(), op->GetPlace(), var_names, + gcs.at(places[op->GetScopeIdx()]).get(), &(ref_cnts[op->GetScopeIdx()])); auto it = std::find_if( diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index b6b5ad42c4..d3865c2c29 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -42,6 +42,7 @@ void OpGraphView::Build(const std::vector &ops) { std::unordered_set OpGraphView::AllOps() const { std::unordered_set ret; + ret.reserve(preceding_ops_.size()); for (auto &pair : preceding_ops_) { ret.insert(pair.first); } diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f2c9dfb524..13a042d8e6 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,15 +29,17 @@ namespace paddle { namespace framework { namespace details { -class OpRelationDetector { - public: +// A functor to shrink/remove operators who depend on other operators in a set +class ShrinkDepsOpFunctor { + private: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; - explicit OpRelationDetector(const std::vector &all_ops) + public: + explicit ShrinkDepsOpFunctor(const std::vector &all_ops) : graph_(all_ops) {} template - OpSet MaxNoDepOps(const OpSet &op_set) const { + OpSet operator()(const OpSet &op_set) const { using KeyType = typename OpSet::key_type; static_assert( std::is_base_of(ops[i])); + ret.emplace(static_cast(ops[i])); } } return ret; @@ -59,7 +61,7 @@ class OpRelationDetector { private: std::vector> GetRelations( - const std::vector ops) const { + const std::vector &ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); @@ -112,6 +114,10 @@ class OpRelationDetector { const OpGraphView graph_; }; +/** + * Find the nearest downstream computation op handle. If the op is a + * computation op, just return itself. + */ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -134,33 +140,87 @@ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( return nullptr; } +static std::unordered_set +ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, + const ShrinkDepsOpFunctor &shrink_func, + bool *ok) { + // stage one. Get last op for variable. + std::unordered_set candidates; + { + if (var->PendingOps().empty() && var->GeneratedOp()) { + // No operator depends on this variable. So the last operator is the op + // who generates this variable. + candidates.emplace(var->GeneratedOp()); + } else { + candidates = var->PendingOps(); + } + + // No pending ops or generated op is nullptr + if (candidates.empty()) { + *ok = false; + return {}; + } + } + + // stage two. Try to cast them to computation op. + // return (*ok=false) when failed. 
+ // + // The reason why we cannot make any types of op handle to be the last lived + // op is: + // some op handle may operate on many DeviceContext, however, our garbage + // collector can only wait one DeviceContext for now. So currently, we wait + // the nearest compute op. + std::unordered_set computation_op; + { + for (auto *op : candidates) { + auto *compute_op = + FindNextComputationOpHandleOrReturnItself(op, scope_idx); + if (compute_op == nullptr) { + *ok = false; + return {}; + } + computation_op.emplace(compute_op); + } + } + + // stage three. Try to shrink computation op if they depend on each other. + // Get the smallest set of the most ops. + *ok = true; + return shrink_func(computation_op); +} + +static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kGlobalReferenceCount); auto &last_live_ops_of_vars = Get>(kLastLiveOpsOfVars); - last_live_ops_of_vars = std::vector(vars.size()); - ref_cnts = std::vector(vars.size()); + PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(), + "Last Live Ops and Reference Counts of vars should be " + "initialized at here."); - OpRelationDetector detector(ir::FilterByNodeWrapper(*graph)); + const auto &vars = graph->Get(kGraphVars); - for (size_t i = 0; i < vars.size(); ++i) { - for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) { - continue; - } + last_live_ops_of_vars.resize(vars.size()); + ref_cnts.resize(vars.size()); - const std::string &var_name = name_var_pair.first; - auto *last_ver_var = name_var_pair.second.back(); + ShrinkDepsOpFunctor shrink_func( + ir::FilterByNodeWrapper(*graph)); - VarDesc *var_desc = nullptr; - std::find_if(name_var_pair.second.rbegin(), name_var_pair.second.rend(), - [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + // Whether this variable can be reused or deleted? If not, we do not + // compute reference counts and dependencies. 
+ VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second); if (var_desc == nullptr || var_desc->Persistable()) { continue; @@ -170,50 +230,20 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( if (var_type != proto::VarType::LOD_TENSOR && var_type != proto::VarType::SELECTED_ROWS && var_type != proto::VarType::LOD_TENSOR_ARRAY) { + // Var type cannot be deleted continue; } - std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) -> bool { - auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); - if (compute_op) { - last_live_op.insert(compute_op); - return true; - } else { - return false; - } - }; - - bool can_delete = false; - auto &pending_ops = last_ver_var->PendingOps(); - if (pending_ops.empty()) { - auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op && add_last_live_op(generated_op)) { - can_delete = true; - } - } else { - can_delete = true; - for (auto *pending_op : pending_ops) { - if (!add_last_live_op(pending_op)) { - can_delete = false; - break; - } - } - } - - if (can_delete) { - size_t original_size = last_live_op.size(); - last_live_op = detector.MaxNoDepOps(last_live_op); - if (last_live_op.size() != original_size) { - VLOG(10) << "Shrink last living op number of " << var_name << " from " - << original_size << " to " << last_live_op.size(); - } - - PADDLE_ENFORCE(!last_live_op.empty(), - "Last living ops of %s cannot be empty", var_name); + bool ok; + auto result = ExtractComputationOpFromLastLivedVar( + name_var_pair.second.back(), i, shrink_func, &ok); - ref_cnts[i].emplace(var_name, last_live_op.size()); - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (ok) { + auto &var_name = name_var_pair.first; + PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", + var_name); + ref_cnts[i].emplace(var_name, result.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(result)); } } } diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc new file mode 100644 index 0000000000..89bd08c2d0 --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +namespace details {} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index eb534f9701..1c083dbf00 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -18,10 +18,10 @@ #include #include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/tensor.h" namespace paddle { namespace framework { @@ -35,7 +35,7 @@ using AtomicReferenceCountMap = std::unordered_map>; using GarbageCollectorMap = - std::map>>; + std::map>; const char kGlobalReferenceCount[] = "global_reference_count"; const char kRuntimeReferenceCount[] = "runtime_reference_count"; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index b8775fc329..57f6fc66c5 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -30,20 +30,7 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), var_infos_(std::move(var_infos)), - places_(std::move(places)) { - if (Graph().Has(details::kGarbageCollector)) { - gc_ = &(Graph().Get(details::kGarbageCollector)); - } -} - -void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { - if (gc_) { - for (auto &gc_pair : *gc_) { - gc_pair.second->Wait(); - gc_pair.second->Reset(); - } - } -} + places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { @@ -83,19 +70,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (auto &p : places_) { + for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - WaitAllGarbageCollectors(); for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } - } else { - WaitAllGarbageCollectors(); } - if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 6086a219e0..5e87e0bf50 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -21,11 +21,9 @@ #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" - namespace paddle { namespace framework { namespace details { @@ -50,8 +48,6 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; private: - void WaitAllGarbageCollectors(); - size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; @@ -59,8 +55,6 @@ 
class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope*> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
-
-  GarbageCollectorMap* gc_{nullptr};
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 04425a5983..767bbb524f 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/executor.h"
+#include <deque>
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
@@ -83,31 +84,37 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 }
 
 static void DeleteUnusedTensors(
-    const Scope& scope, const OperatorBase* op, GarbageCollector<Tensor>* gc,
+    const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
     std::unordered_map<std::string, size_t>* ref_cnts) {
-  std::unordered_set<Tensor*> erase_tensors;
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
 
   auto handler = [&](const VariableNameMap& name_map) {
     for (auto& name_pair : name_map) {
       for (auto& name : name_pair.second) {
         auto it = ref_cnts->find(name);
         if (it == ref_cnts->end()) continue;
-        if (--(it->second) == 0) {
-          auto* var = scope.FindVar(name);
-          if (var != nullptr) {
-            VLOG(2) << "Erase tensor \'" << name << "\'";
-            if (var->IsType<LoDTensor>()) {
-              erase_tensors.insert(var->GetMutable<LoDTensor>());
-            } else if (var->IsType<SelectedRows>()) {
-              erase_tensors.insert(
-                  var->GetMutable<SelectedRows>()->mutable_value());
-            } else if (var->IsType<LoDTensorArray>()) {
-              auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
-              for (auto& t : *lod_tensor_arr) {
-                erase_tensors.insert(&t);
-              }
-            }
+        if (--(it->second) != 0) {
+          continue;
+        }
+        auto* var = scope.FindVar(name);
+        if (var == nullptr) {
+          continue;
+        }
+
+        VLOG(2) << "Erase variable " << name;
+        if (var->IsType<LoDTensor>()) {
+          garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
+        } else if (var->IsType<SelectedRows>()) {
+          garbages.emplace_back(
+              var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
+        } else if (var->IsType<LoDTensorArray>()) {
+          auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
+          for (auto& t : *lod_tensor_arr) {
+            garbages.emplace_back(t.MoveMemory());
           }
+        } else {
+          PADDLE_THROW("Type %s of %s is not supported for eager deletion",
+                       var->Type().name(), name);
         }
       }
     }
@@ -116,8 +123,8 @@ static void DeleteUnusedTensors(
   handler(op->Inputs());
   handler(op->Outputs());
 
-  if (!erase_tensors.empty()) {
-    gc->Add(erase_tensors);
+  if (!garbages.empty()) {
+    gc->Add(std::move(garbages));
   }
 }
 
@@ -411,22 +418,22 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
-  std::unique_ptr<GarbageCollector<Tensor>> gc;
+  std::unique_ptr<GarbageCollector> gc;
   if (max_memory_size >= 0) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
       if (IsFastEagerDeletionModeEnabled()) {
-        gc.reset(new UnsafeFastGPUGarbageCollector<Tensor>(
+        gc.reset(new UnsafeFastGPUGarbageCollector(
             boost::get<platform::CUDAPlace>(place_), max_memory_size));
       } else {
-        gc.reset(new DefaultStreamGarbageCollector<Tensor>(
+        gc.reset(new DefaultStreamGarbageCollector(
            boost::get<platform::CUDAPlace>(place_), max_memory_size));
       }
     } else if (platform::is_cpu_place(place_)) {
 #endif
-      gc.reset(new CPUGarbageCollector<Tensor>(
-          boost::get<platform::CPUPlace>(place_), max_memory_size));
+      gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place_),
+                                       max_memory_size));
 #ifdef PADDLE_WITH_CUDA
     }
 #endif
@@ -442,7 +449,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  if (gc) gc->Wait();
 
   if (local_scope != scope) {
     scope->DeleteScope(local_scope);
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
new file mode 100644
index 0000000000..54d9d0dc01
--- /dev/null
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+#include "paddle/fluid/framework/garbage_collector.h"
+
+namespace paddle {
+namespace framework {
+
+GarbageCollector::GarbageCollector(const platform::Place &place,
+                                   size_t max_memory_size)
+    : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
+  garbages_.reset(new GarbageQueue());
+  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
+}
+
+CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
+                                         size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
+  callback();
+}
+
+#ifdef PADDLE_WITH_CUDA
+UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
+    const platform::CUDAPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void UnsafeFastGPUGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  callback();
+}
+
+DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
+    const platform::CUDAPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void DefaultStreamGarbageCollector::Wait() const {
+  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+      ->WaitStreamCallback();
+}
+
+void DefaultStreamGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+      ->AddStreamCallback(callback);
+}
+
+StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
+                                               size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {
+  platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  callback_manager_.reset(new platform::StreamCallbackManager(stream_));
+}
+
+StreamGarbageCollector::~StreamGarbageCollector() {
+  auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
+  platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+}
+
+cudaStream_t StreamGarbageCollector::stream() const { return stream_; }
+
+void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
+
+void StreamGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  callback_manager_->AddCallback(callback);
+}
+#endif
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 1382e0d461..2768671029 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ 
b/paddle/fluid/framework/garbage_collector.h @@ -14,160 +14,83 @@ #pragma once -#include #include #include #include #include // NOLINT -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { -// T should have memory_size() and clear() method -template class GarbageCollector { public: - GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { - garbages_.reset(new std::deque()); - dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); - } + using GarbageQueue = std::deque>; - virtual ~GarbageCollector() {} + GarbageCollector(const platform::Place &place, size_t max_memory_size); - size_t NumOfGarbages() const { - std::lock_guard guard(mutex_); - return garbages_->size(); - } + virtual ~GarbageCollector() = default; - void Reset() { - std::lock_guard guard(mutex_); - garbages_.reset(new std::deque()); - cur_memory_size_ = 0; - } + virtual void Wait() const {} template - void Add(const Container &objs) { - Add(objs, []() {}); - } + void Add(Container &&objs); template - void Add(const Container &objs, Callback &&callback) { - std::deque *clear_deque = nullptr; - { - std::lock_guard guard(mutex_); - for (auto *obj : objs) { - garbages_->push_back(obj); - cur_memory_size_ += obj->memory_size(); - } - if (cur_memory_size_ >= max_memory_size_) { - cur_memory_size_ = 0; - clear_deque = garbages_.release(); - garbages_.reset(new std::deque()); - } - } - - if (clear_deque != nullptr) { - callback(); - ClearCallback([clear_deque]() { - for (auto *obj : *clear_deque) obj->clear(); - delete clear_deque; - }); - } - } - - virtual void Wait() const {} + void Add(Container &&objs, Callback &&callback); protected: virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::unique_ptr> garbages_; + std::unique_ptr garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; - size_t cur_memory_size_ = 0; + size_t cur_memory_size_{0}; }; -template -class CPUGarbageCollector : public GarbageCollector { +class CPUGarbageCollector : public GarbageCollector { public: - CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; #ifdef PADDLE_WITH_CUDA -template -class UnsafeFastGPUGarbageCollector : public GarbageCollector { +class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; -template -class DefaultStreamGarbageCollector : public GarbageCollector { +class DefaultStreamGarbageCollector : public GarbageCollector { public: DefaultStreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); - cudaStream_t stream() const { - return static_cast(this->dev_ctx_) - ->stream(); - } - - void Wait() const override { - 
-    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->WaitStreamCallback();
-  }
+  void Wait() const override;
 
  protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->AddStreamCallback(callback);
-  }
+  void ClearCallback(const std::function<void()> &callback) override;
 };
 
-template <typename T>
-class StreamGarbageCollector : public GarbageCollector<T> {
+class StreamGarbageCollector : public GarbageCollector {
  public:
   StreamGarbageCollector(const platform::CUDAPlace &place,
-                         size_t max_memory_size)
-      : GarbageCollector<T>(place, max_memory_size) {
-    platform::CUDADeviceGuard guard(place.device);
-    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
-    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
-  }
+                         size_t max_memory_size);
 
-  ~StreamGarbageCollector() {
-    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
-    platform::CUDADeviceGuard guard(place.device);
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
-  }
+  ~StreamGarbageCollector();
 
-  void Wait() const override { callback_manager_->Wait(); }
+  void Wait() const override;
 
-  cudaStream_t stream() const { return stream_; }
+  cudaStream_t stream() const;
 
  protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    callback_manager_->AddCallback(callback);
-  }
+  void ClearCallback(const std::function<void()> &callback) override;
 
  private:
   cudaStream_t stream_;
@@ -175,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector<T> {
 };
 #endif
 
+template <typename Container>
+void GarbageCollector::Add(Container &&objs) {
+  Add(std::forward<Container>(objs), []() {});
+}
+
+template <typename Container, typename Callback>
+void GarbageCollector::Add(Container &&objs, Callback &&callback) {
+  GarbageQueue *garbage_queue = nullptr;
+  {
+    std::lock_guard<std::mutex> guard(mutex_);
+    for (auto &obj : objs) {
+      if (!obj) continue;
+      cur_memory_size_ += obj->size();
+      garbages_->push_back(std::move(obj));
+    }
+    if (cur_memory_size_ >= max_memory_size_) {
+      cur_memory_size_ = 0;
+      garbage_queue = garbages_.release();
+      garbages_.reset(new GarbageQueue());
+    }
+  }
+
+  if (garbage_queue) {
+    callback();
+    ClearCallback([garbage_queue]() { delete garbage_queue; });
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
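With Add() now an out-of-line template over any container of
shared_ptr<memory::Allocation> holders, a caller batches everything it wants
freed into one call. A hedged sketch of the calling convention:
GarbageCollector and GarbageQueue come from the header above, while
ReleaseBatch itself is purely illustrative:

    #include <utility>

    #include "paddle/fluid/framework/garbage_collector.h"

    namespace fw = paddle::framework;

    // Illustrative helper: drain a batch of allocation holders into the GC.
    // Each non-null holder is moved onto the internal GarbageQueue and charged
    // obj->size() bytes; nothing is destroyed until max_memory_size_ is hit.
    void ReleaseBatch(fw::GarbageCollector *gc,
                      fw::GarbageCollector::GarbageQueue *holders) {
      gc->Add(std::move(*holders));
      holders->clear();
    }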
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index e51b1f1f73..7458b69af8 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -97,29 +97,31 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
     if (gcs_.count(place) > 0) {
       continue;
     }
-    GarbageCollector<Tensor> *gc = nullptr;
+    std::unique_ptr<GarbageCollector> gc;
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place)) {
       if (IsFastEagerDeletionModeEnabled()) {
-        gc = new UnsafeFastGPUGarbageCollector<Tensor>(
-            boost::get<platform::CUDAPlace>(place), max_memory_size);
+        gc.reset(new UnsafeFastGPUGarbageCollector(
+            boost::get<platform::CUDAPlace>(place), max_memory_size));
       } else {
-        gc = new StreamGarbageCollector<Tensor>(
-            boost::get<platform::CUDAPlace>(place), max_memory_size);
+        gc.reset(new StreamGarbageCollector(
+            boost::get<platform::CUDAPlace>(place), max_memory_size));
       }
       VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
-    } else if (platform::is_cpu_place(place)) {
+    } else {
 #endif
-      gc = new CPUGarbageCollector<Tensor>(
-          boost::get<platform::CPUPlace>(place), max_memory_size);
-      VLOG(10) << "Created GarbageCollector at " << place;
+      if (platform::is_cpu_place(place)) {
+        gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place),
+                                         max_memory_size));
+        VLOG(10) << "Created GarbageCollector at " << place;
+      } else {
+        PADDLE_THROW("Unsupported place for garbage collection");
+      }
 #ifdef PADDLE_WITH_CUDA
     }
 #endif
 
-    if (gc) {
-      gcs_[place] = std::unique_ptr<GarbageCollector<Tensor>>(gc);
-    }
+    gcs_.emplace(place, std::move(gc));
   }
 
   if (!gcs_.empty()) {
@@ -144,8 +146,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
     eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
     graph = eager_deletion_pass->Apply(std::move(graph));
     VLOG(10) << "EagerDeletionPass Applied";
-
-    graph->SetNotOwned(details::kGarbageCollector, &gcs_);
   }
 
   return graph;
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index cb3b6cdc3e..6fa5e99f9f 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -38,7 +38,7 @@ DEFINE_double(
     "Memory size threshold (GB) when the garbage collector clear tensors."
     "Disabled when this value is less than 0");
 
-DEFINE_bool(fast_eager_deletion_mode, true,
+DEFINE_bool(fast_eager_deletion_mode, false,
             "Fast eager deletion mode. If enabled, memory would release "
             "immediately without waiting GPU kernel ends.");
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 71e8badd4b..9f7027f5ae 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -158,6 +158,10 @@ class Tensor {
   const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
 
   size_t offset() const { return offset_; }
 
+  std::shared_ptr<memory::Allocation> MoveMemory() {
+    return std::move(holder_);
+  }
+
  private:
   /*! holds the memory block if allocated. */
   std::shared_ptr<memory::Allocation> holder_;
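Tensor::MoveMemory() transfers holder_ out of the tensor, so the eager
deletion machinery can keep the allocation alive on the GC queue while the
variable itself is left empty. A toy illustration of the same
move-out-of-a-member pattern, independent of Paddle:

    #include <cassert>
    #include <memory>
    #include <utility>

    struct Buffer {
      std::shared_ptr<int> holder = std::make_shared<int>(42);
      // Analogous to Tensor::MoveMemory(): transfer ownership to the caller.
      std::shared_ptr<int> MoveMemory() { return std::move(holder); }
    };

    int main() {
      Buffer b;
      auto stolen = b.MoveMemory();
      assert(b.holder == nullptr);  // the buffer no longer owns the data
      assert(*stolen == 42);        // the caller now keeps it alive
      return 0;
    }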
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
new file mode 100644
index 0000000000..1ec174544c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_eager_deletion_lstm_net import TestBase
+import paddle.fluid as fluid
+
+
+def gru_net(data,
+            label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            emb_lr=400.0):
+    emb = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
+    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
+    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
+    gru_max_tanh = fluid.layers.tanh(gru_max)
+    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    return avg_cost
+
+
+class GRUTest(TestBase):
+    def setUp(self):
+        self.net = gru_net
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
new file mode 100644
index 0000000000..431765bff2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
+os.environ['CPU_NUM'] = '2'
+
+import six
+import unittest
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
+    if use_cuda and not core.is_compiled_with_cuda():
+        print('Skip use_cuda=True because Paddle is not compiled with cuda')
+        return
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    train_reader = paddle.batch(
+        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    cost = network(data, label, len(word_dict))
+    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
+    optimizer.minimize(cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+    reader = feeder.decorate_reader(
+        train_reader, multi_devices=use_parallel_executor)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if use_parallel_executor:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=cost.name)
+        fetch_list = [cost.name]
+    else:
+        train_exe = exe
+        fetch_list = [cost]
+
+    for pass_id in six.moves.xrange(pass_num):
+        batch_id = 0
+        for data in reader():
+            train_exe.run(feed=data,
+                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
+            batch_id += 1
+            if batch_id > 16:
+                break
+
+
+def lstm_net(data,
+             label,
+             dict_dim,
+             emb_dim=128,
+             hid_dim=128,
+             hid_dim2=96,
+             class_dim=2,
+             emb_lr=30.0):
+    emb = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
+    lstm_h, c = fluid.layers.dynamic_lstm(
+        input=fc0, size=hid_dim * 4, is_reverse=False)
+    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
+    lstm_max_tanh = fluid.layers.tanh(lstm_max)
+    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    return avg_cost
+
+
+class TestBase(unittest.TestCase):
+    def setUp(self):
+        self.net = lstm_net
+
+    def test_network(self):
+        for use_cuda in [True, False]:
+            for use_parallel_executor in [False, True]:
+                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
+                      format(self.net.__name__, use_cuda,
+                             use_parallel_executor))
+                with fluid.program_guard(fluid.Program(), fluid.Program()):
+                    with fluid.scope_guard(core.Scope()):
+                        train(self.net, use_cuda, use_parallel_executor)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 2c6159a151d573ca697e2dfd591720cc854b4b9b Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Fri, 7 Dec 2018 13:59:36 +0000
Subject: [PATCH 11/14] fix unittest

fix cmake
test=develop
---
 paddle/fluid/framework/CMakeLists.txt         |  4 +-
 .../test_eager_deletion_dynamic_rnn_base.py   | 86 +++++++++++++++++++
 .../unittests/test_eager_deletion_gru_net.py  |  2 +-
 .../unittests/test_eager_deletion_lstm_net.py | 67 +--------------
 4 files changed, 92 insertions(+), 67 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index f2361c5cea..b236eef3ce 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -171,9 +171,9 @@ if(WITH_DISTRIBUTE)
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   if(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper garbage_collector)
   else(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper garbage_collector)
   endif(NOT WIN32)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
new file mode 100644
index 0000000000..e91cfe0b45
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
+os.environ['CPU_NUM'] = '2'
+
+import six
+import unittest
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
+    if use_cuda and not core.is_compiled_with_cuda():
+        print('Skip use_cuda=True because Paddle is not compiled with cuda')
+        return
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    train_reader = paddle.batch(
+        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    cost = network(data, label, len(word_dict))
+    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
+    optimizer.minimize(cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+    reader = feeder.decorate_reader(
+        train_reader, multi_devices=use_parallel_executor)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if use_parallel_executor:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=cost.name)
+        fetch_list = [cost.name]
+    else:
+        train_exe = exe
+        fetch_list = [cost]
+
+    for pass_id in six.moves.xrange(pass_num):
+        batch_id = 0
+        for data in reader():
+            train_exe.run(feed=data,
+                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
+            batch_id += 1
+            if batch_id > 16:
+                break
+
+
+class TestBase(unittest.TestCase):
+    def setUp(self):
+        self.net = None
+
+    def test_network(self):
+        if self.net is None:
+            return
+
+        for use_cuda in [True, False]:
+            for use_parallel_executor in [False, True]:
+                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
+                      format(self.net.__name__, use_cuda,
+                             use_parallel_executor))
+                with fluid.program_guard(fluid.Program(), fluid.Program()):
+                    with fluid.scope_guard(core.Scope()):
+                        train(self.net, use_cuda, use_parallel_executor)
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
index 1ec174544c..5ed3d9fdf3 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import unittest
-from test_eager_deletion_lstm_net import TestBase
+from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
index 431765bff2..8462c06aa5 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
@@ -12,60 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
-os.environ['CPU_NUM'] = '2'
-
-import six
-import unittest
-
-import paddle
-import paddle.fluid.core as core
+from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid
-
-
-def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
-    if use_cuda and not core.is_compiled_with_cuda():
-        print('Skip use_cuda=True because Paddle is not compiled with cuda')
-        return
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    train_reader = paddle.batch(
-        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
-
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-    cost = network(data, label, len(word_dict))
-    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
-    optimizer.minimize(cost)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-    reader = feeder.decorate_reader(
-        train_reader, multi_devices=use_parallel_executor)
-
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-
-    if use_parallel_executor:
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda, loss_name=cost.name)
-        fetch_list = [cost.name]
-    else:
-        train_exe = exe
-        fetch_list = [cost]
-
-    for pass_id in six.moves.xrange(pass_num):
-        batch_id = 0
-        for data in reader():
-            train_exe.run(feed=data,
-                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
-            batch_id += 1
-            if batch_id > 16:
-                break
+import unittest
 
 
 def lstm_net(data,
@@ -92,20 +41,10 @@ def lstm_net(data,
     return avg_cost
 
 
-class TestBase(unittest.TestCase):
+class LSTMTest(TestBase):
     def setUp(self):
         self.net = lstm_net
 
-    def test_network(self):
-        for use_cuda in [True, False]:
-            for use_parallel_executor in [False, True]:
-                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
-                      format(self.net.__name__, use_cuda,
-                             use_parallel_executor))
-                with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    with fluid.scope_guard(core.Scope()):
-                        train(self.net, use_cuda, use_parallel_executor)
-
 
 if __name__ == "__main__":
     unittest.main()

From 06f8aa5b97be564b878848acd216069e23081300 Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Wed, 12 Dec 2018 03:08:21 +0000
Subject: [PATCH 12/14] remove while_op support temporarily

test=develop
---
 paddle/fluid/framework/executor.cc          |  3 +-
 .../fluid/operators/controlflow/while_op.cc | 46 +------------------
 2 files changed, 3 insertions(+), 46 deletions(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 767bbb524f..7eab876015 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -419,7 +419,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   int64_t max_memory_size = GetEagerDeletionThreshold();
 
   std::unique_ptr<GarbageCollector> gc;
-  if (max_memory_size >= 0) {
+  // skip while_op and while_grad_op temporarily
+  if (max_memory_size >= 0 && !keep_kids) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
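The new keep_kids condition above is the whole while_op opt-out: runs that
must keep child scopes alive (while_op preserves its step scopes for
while_grad) get no garbage collector at all, rather than risking an early
free of a tensor the backward pass still reads. A toy restatement of that
decision, with all names hypothetical:

    #include <cstdint>
    #include <memory>

    struct ToyGC {};

    // Eager deletion is active only when a threshold is configured AND the
    // run does not need to keep child scopes alive for a later backward run.
    std::unique_ptr<ToyGC> MaybeMakeGC(int64_t max_memory_size,
                                       bool keep_kids) {
      if (max_memory_size >= 0 && !keep_kids) {
        return std::unique_ptr<ToyGC>(new ToyGC());
      }
      return nullptr;  // GC off: nothing is eagerly deleted this run
    }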
while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ - std::unordered_set bwd_skip_vars; - if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set fwd_skip_vars; - for (auto *op_desc : grad_block->AllOps()) { - auto skippable = [&](const std::string &name) { - return !grad_block->HasVar(name) && - (fwd_block->HasVarRecursive(name) || - parent_block->HasVarRecursive(name)); - }; - for (auto &in_arg_name : op_desc->InputArgumentNames()) { - if (skippable(in_arg_name)) { - fwd_skip_vars.insert(in_arg_name); - } - } - - for (auto &out_arg_name : op_desc->OutputArgumentNames()) { - if (skippable(out_arg_name)) { - fwd_skip_vars.insert(out_arg_name); - } - } - } - - if (!fwd_skip_vars.empty()) { - // FIXME(zjl): ugly const_cast here, maybe we should find a better way - // to modify forward while_op - auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr(kSkipEagerDeletionVars, - std::vector(fwd_skip_vars.begin(), - fwd_skip_vars.end())); - } - - // Find backward skip vars - auto fwd_input = Input(kX); - for (size_t i = 0; i < igs.size(); ++i) { - if (igs[i] == framework::kEmptyVarName) { - continue; - } - bwd_skip_vars.insert(igs[i]); - bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); - } - } - while_grad->SetAttr( - kSkipEagerDeletionVars, - std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); + while_grad->SetAttr(kSkipEagerDeletionVars, std::vector()); return std::unique_ptr(while_grad); } From e82772f42518f1cff790ac04aa1c73c2e5b201e9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 09:22:44 +0000 Subject: [PATCH 13/14] fix cmake conflict test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b1cfb23f3a..6d7a69c8c9 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -169,7 +169,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() From 2328bee1cc835d789b83cd4da9bef6b588bc87c5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 13 Dec 2018 06:34:09 +0000 Subject: [PATCH 14/14] fix Windows compile bug test=develop --- .../framework/details/eager_deletion_op_handle.cc | 6 +++--- paddle/fluid/framework/executor.cc | 10 ++++++---- paddle/fluid/framework/tensor.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 3b27415e43..abacb11e3b 100644 --- 
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 3b27415e43..abacb11e3b 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -77,14 +77,14 @@ void EagerDeletionOpHandle::RunImpl() {
     VLOG(2) << "Erase variable " << name;
 
     if (var->IsType<LoDTensor>()) {
-      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
+      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
     } else if (var->IsType<SelectedRows>()) {
       garbages.emplace_back(
-          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
+          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
     } else if (var->IsType<LoDTensorArray>()) {
       auto *tensor_arr = var->GetMutable<LoDTensorArray>();
       for (auto &t : *tensor_arr) {
-        garbages.emplace_back(t.MoveMemory());
+        garbages.emplace_back(t.MoveMemoryHolder());
       }
     } else {
       PADDLE_THROW("Type %s of %s is not supported eager deletion",
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 16c4552a5f..0c4bd336c5 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -106,14 +106,16 @@ static void DeleteUnusedTensors(
       VLOG(2) << "Erase variable " << name;
 
       if (var->IsType<LoDTensor>()) {
-        garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
-      } else if (var->IsType<SelectedRows>()) {
         garbages.emplace_back(
-            var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
+            var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+      } else if (var->IsType<SelectedRows>()) {
+        garbages.emplace_back(var->GetMutable<SelectedRows>()
+                                  ->mutable_value()
+                                  ->MoveMemoryHolder());
       } else if (var->IsType<LoDTensorArray>()) {
         auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
         for (auto& t : *lod_tensor_arr) {
-          garbages.emplace_back(t.MoveMemory());
+          garbages.emplace_back(t.MoveMemoryHolder());
         }
       } else {
         PADDLE_THROW("Type %s of %s is not supported eager deletion",
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 9f7027f5ae..153222506a 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -158,7 +158,7 @@ class Tensor {
   const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
 
   size_t offset() const { return offset_; }
 
-  std::shared_ptr<memory::Allocation> MoveMemory() {
+  std::shared_ptr<memory::Allocation> MoveMemoryHolder() {
     return std::move(holder_);
   }