Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/tensor_type

test=develop
7 years ago · 4ecdb6f486
parent aa38fc4ce5 5384206aec
commit 4ecdb6f486
50 changed files with 2325 additions and 640 deletions
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -198,6 +198,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
 paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
@ -183,6 +185,8 @@ else()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 target_link_libraries(executor garbage_collector)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
        graph build_strategy
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@ -45,10 +45,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
-if (WITH_GPU)
+cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
-  cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
+cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
-          all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
-endif()
+cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
@ -56,10 +56,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) 
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) 
 if (WITH_GPU)
  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
 endif()
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@ -20,11 +20,13 @@ namespace paddle {
 namespace framework {
 namespace details {
 ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
-                                         platform::Place place)
+                                         platform::Place place,
                                         size_t scope_idx)
    : OpHandleBase(node),
      op_(framework::OpRegistry::CreateOp(*node->Op())),
      scope_(scope),
-      place_(place) {}
+      place_(place),
      scope_idx_(scope_idx) {}
 void ComputationOpHandle::RunImpl() {
  WaitInputVarGenerated(place_);
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@ -28,7 +28,8 @@ namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
 public:
-  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);
+  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
                      size_t scope_idx);
  std::string Name() const override;
@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase {
  void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }
  size_t GetScopeIdx() const { return scope_idx_; }
 protected:
  void RunImpl() override;
@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase {
  std::unique_ptr<OperatorBase> op_;
  Scope *scope_;
  platform::Place place_;
  size_t scope_idx_;
  bool is_lock_and_record_event_free_{false};
 };
 }  // namespace details
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@ -0,0 +1,122 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 namespace paddle {
 namespace framework {
 namespace details {
 // Builds an op handle that releases the variables named in `var_names` once
 // their entries in `ref_cnts` have been counted down (see RunImpl).
 //
 // `gc` and `ref_cnts` are borrowed, not owned. In CUDA builds, when `place` is
 // a GPU place, the CUDADeviceContext for that place is cached; a CUDA event
 // (timing disabled) is additionally created only when `gc` is a
 // StreamGarbageCollector. In all other cases `dev_ctx_`/`event_` keep their
 // in-class defaults.
 EagerDeletionOpHandle::EagerDeletionOpHandle(
    ir::Node *node, const Scope *scope, const platform::Place &place,
    const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
    AtomicReferenceCountMap *ref_cnts)
    : OpHandleBase(node),
      scope_(scope),
      var_names_(var_names),
      gc_(gc),
      ref_cnts_(ref_cnts) {
 #ifdef PADDLE_WITH_CUDA
  if (platform::is_gpu_place(place)) {
    // Base-to-derived conversion inside one class hierarchy: static_cast is
    // the correct named cast here (reinterpret_cast performs no pointer
    // adjustment and hides hierarchy mistakes from the compiler).
    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
      // Create the event while the target device is current.
      platform::CUDADeviceGuard guard(
          boost::get<platform::CUDAPlace>(place).device);
      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
      PADDLE_ENFORCE_NOT_NULL(event_);
    }
  }
 #endif
 }
 // Destroys the CUDA event, if one was created, under a device guard for the
 // place reported by `dev_ctx_`. Nothing to do in non-CUDA builds.
 EagerDeletionOpHandle::~EagerDeletionOpHandle() {
 #ifdef PADDLE_WITH_CUDA
  if (event_ != nullptr) {
    const auto device_id =
        boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace()).device;
    platform::CUDADeviceGuard guard(device_id);
    PADDLE_ENFORCE(cudaEventDestroy(event_));
  }
 #endif
 }
 // Fixed identifier reported for this kind of op handle.
 std::string EagerDeletionOpHandle::Name() const {
  return std::string("eager_deletion");
 }
 // Decrements the runtime reference count of every tracked variable by one and
 // hands the memory holders of the variables whose previous count was exactly
 // 1 to ClearGarbages(). Throws (PADDLE_THROW) when such a variable holds a
 // type other than LoDTensor, SelectedRows or LoDTensorArray.
 void EagerDeletionOpHandle::RunImpl() {
  auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
  std::deque<std::shared_ptr<memory::Allocation>> garbages;

  for (auto &name : var_names_) {
    auto cnt_iter = ref_cnts_->find(name);
    // No reference count is tracked for this variable: nothing to do.
    if (cnt_iter == ref_cnts_->end()) continue;
    // fetch_sub returns the value held before the decrement, so anything
    // other than 1 means the count has not just dropped to 0.
    if (cnt_iter->second.fetch_sub(1) != 1) continue;

    auto *var = exec_scope->FindVar(name);
    if (var == nullptr) continue;

    VLOG(2) << "Erase variable " << name;

    if (var->IsType<LoDTensor>()) {
      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
    } else if (var->IsType<SelectedRows>()) {
      auto *rows_value = var->GetMutable<SelectedRows>()->mutable_value();
      garbages.emplace_back(rows_value->MoveMemoryHolder());
    } else if (var->IsType<LoDTensorArray>()) {
      // Every tensor in the array gives up its own memory holder.
      for (auto &tensor : *var->GetMutable<LoDTensorArray>()) {
        garbages.emplace_back(tensor.MoveMemoryHolder());
      }
    } else {
      PADDLE_THROW("Type %s of %s is not supported eager deletion",
                   var->Type().name(), name);
    }
  }

  if (garbages.empty()) return;
  ClearGarbages(&garbages);
 }
 // Moves `*garbages` into the garbage collector.
 //
 // When a CUDA event exists (created in the constructor only for a
 // StreamGarbageCollector), a callback is passed along that records the event
 // on the compute stream and makes the collector's stream wait on it.
 // Otherwise the garbage is added without a callback.
 void EagerDeletionOpHandle::ClearGarbages(
    std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
 #ifdef PADDLE_WITH_CUDA
  if (event_) {
    auto compute_stream = dev_ctx_->stream();
    auto *stream_gc = reinterpret_cast<StreamGarbageCollector *>(gc_);
    auto callback_stream = stream_gc->stream();
    // `event_` is a member, so it is reached through `this` in the callback.
    auto sync_streams = [this, compute_stream, callback_stream]() {
      PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
      PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
    };
    gc_->Add(std::move(*garbages), sync_streams);
    return;
  }
 #endif
  gc_->Add(std::move(*garbages));
 }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@ -0,0 +1,58 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <deque>
 #include <string>
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 namespace paddle {
 namespace framework {
 class Scope;
 namespace details {
 // Op handle that, when run, counts down the reference counts of a fixed set
 // of variables and passes the memory of exhausted ones to a GarbageCollector.
 class EagerDeletionOpHandle : public OpHandleBase {
 public:
  // `gc` and `ref_cnts` are stored as raw, non-owning pointers; `var_names`
  // is copied.
  EagerDeletionOpHandle(ir::Node *node, const Scope *scope,
                        const platform::Place &place,
                        const std::unordered_set<std::string> &var_names,
                        GarbageCollector *gc,
                        AtomicReferenceCountMap *ref_cnts);
  ~EagerDeletionOpHandle();
  std::string Name() const override;
 protected:
  void RunImpl() override;
 private:
  // Forwards the collected allocations to `gc_`.
  void ClearGarbages(std::deque<std::shared_ptr<memory::Allocation>> *garbages);
  const Scope *scope_;
  // Names of the variables this handle is responsible for.
  std::unordered_set<std::string> var_names_;
  GarbageCollector *gc_;               // not own
  AtomicReferenceCountMap *ref_cnts_;  // not own
 #ifdef PADDLE_WITH_CUDA
  // Both stay nullptr unless set by the constructor.
  platform::CUDADeviceContext *dev_ctx_{nullptr};
  cudaEvent_t event_{nullptr};
 #endif
 };
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/details/eager_deletion_pass.cc
@ -0,0 +1,101 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <queue>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/details/eager_deletion_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 namespace paddle {
 namespace framework {
 namespace details {
 // Attaches one EagerDeletionOpHandle behind every op listed in the
 // kLastLiveOpsOfVars attribute and returns the same graph.
 //
 // Reads the pass attributes kRuntimeReferenceCount, kLastLiveOpsOfVars,
 // kGarbageCollector and kAllPlaces. kRuntimeReferenceCount must be empty on
 // entry; it is resized here to one map per entry of the graph's kGraphVars.
 // NOTE(review): the per-device maps are only resized in this function, not
 // filled — presumably they are populated elsewhere before execution; confirm.
 std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  auto &ref_cnts =
      Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount);
  PADDLE_ENFORCE(ref_cnts.empty(),
                 "kRuntimeReferenceCount should be initialized here!");
  const auto &vars = graph->Get<GraphVars>(kGraphVars);
  ref_cnts.resize(vars.size());
  const auto &last_live_ops =
      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
  const auto &gcs = Get<GarbageCollectorMap>(kGarbageCollector);
  const auto &places = Get<std::vector<platform::Place>>(kAllPlaces);
  // a reverse map of last_live_ops
  //   i.e., last op --> variable names which can be deleted.
  std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
      op_vars_map;
  for (auto &var_ops_map : last_live_ops) {
    for (auto &var_ops_pair : var_ops_map) {
      const std::string &var_name = var_ops_pair.first;
      for (auto *op : var_ops_pair.second) {
        op_vars_map[op].insert(var_name);
      }
    }
  }
  for (auto &pair : op_vars_map) {
    auto *op = pair.first;
    auto &var_names = pair.second;
    auto *eager_deletion_node =
        graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation);
    // The collector and the reference-count map are both selected through the
    // op's scope index.
    auto *eager_deletion_op = new EagerDeletionOpHandle(
        eager_deletion_node, op->GetScope(), op->GetPlace(), var_names,
        gcs.at(places[op->GetScopeIdx()]).get(),
        &(ref_cnts[op->GetScopeIdx()]));
    // Make the deletion op depend on `op`: reuse an existing DummyVarHandle
    // output of `op` if there is one, otherwise create a new control
    // dependency variable and add it as an output of `op`.
    // NOTE(review): std::find_if needs <algorithm>, which this file does not
    // include directly — presumably it arrives transitively; confirm.
    auto it = std::find_if(
        op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) {
          return dynamic_cast<DummyVarHandle *>(var) != nullptr;
        });
    if (it != op->Outputs().end()) {
      eager_deletion_op->AddInput(*it);
    } else {
      auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
      op->AddOutput(dep_var);
      eager_deletion_op->AddInput(dep_var);
    }
    // Give the deletion op a dummy output variable of its own.
    auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
    eager_deletion_op->AddOutput(dummy_leaf);
  }
  VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
  return graph;
 }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
 // Registers the pass under the name "eager_deletion_pass" and declares the
 // four attributes that ApplyImpl reads via Get<>().
 REGISTER_PASS(eager_deletion_pass,
              paddle::framework::details::EagerDeletionPass)
    .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount)
    .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
    .RequirePassAttr(paddle::framework::details::kAllPlaces)
    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
--- a/paddle/fluid/framework/details/eager_deletion_pass.h
+++ b/paddle/fluid/framework/details/eager_deletion_pass.h
@ -0,0 +1,32 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 namespace paddle {
 namespace framework {
 namespace details {
 // Graph pass whose ApplyImpl override takes ownership of a graph and returns
 // a graph; the implementation lives in eager_deletion_pass.cc.
 class EagerDeletionPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
 };
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@ -565,7 +565,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
                                                    int dev_id) const {
  result->Get<GraphOps>(kGraphOps).emplace_back(
      new ComputationOpHandle(result->CreateOpNode(node->Op()),
-                              local_scopes_[dev_id], places_[dev_id]));
+                              local_scopes_[dev_id], places_[dev_id], dev_id));
  CreateOpHandleIOs(result, node, dev_id);
 }
@ -688,8 +688,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
  for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
    auto p = places_[scope_idx];
    auto s = local_scopes_[scope_idx];
-    result->Get<GraphOps>(kGraphOps).emplace_back(
+    result->Get<GraphOps>(kGraphOps).emplace_back(new ComputationOpHandle(
-        new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
+        result->CreateOpNode(node->Op()), s, p, scope_idx));
    CreateOpHandleIOs(result, node, scope_idx);
  }
 }
--- a/paddle/fluid/framework/details/op_graph_view.cc
+++ b/paddle/fluid/framework/details/op_graph_view.cc
@ -23,6 +23,8 @@ namespace details {
 // Constructs the view by building it from the given ops.
 OpGraphView::OpGraphView(const std::vector<OpHandleBase *> &ops) {
  Build(ops);
 }
 void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
  preceding_ops_.clear();
  pending_ops_.clear();
  for (auto &op : ops) {
    preceding_ops_[op];
    pending_ops_[op];
@ -40,6 +42,7 @@ void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
 std::unordered_set<OpHandleBase *> OpGraphView::AllOps() const {
  std::unordered_set<OpHandleBase *> ret;
  ret.reserve(preceding_ops_.size());
  for (auto &pair : preceding_ops_) {
    ret.insert(pair.first);
  }
--- a/paddle/fluid/framework/details/op_graph_view.h
+++ b/paddle/fluid/framework/details/op_graph_view.h
@ -14,7 +14,7 @@
 #pragma once
-#include <memory>
+#include <queue>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
@ -34,6 +34,11 @@ class OpGraphView {
  bool HasOp(OpHandleBase *op) const;
  // Use a visitor to visit all pending ops of op
  // Stop when callback returns false
  template <typename Callback>
  bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const;
 private:
  void Build(const std::vector<OpHandleBase *> &ops);
  void EnforceHasOp(OpHandleBase *op) const;
@ -44,6 +49,28 @@ class OpGraphView {
      pending_ops_;
 };
 // Breadth-first visit of every op reachable from `op` through pending-op
 // edges, both direct and transitive.
 //
 // `callback` is invoked at most once per reachable op. The starting op is
 // only passed to `callback` if it is reachable from itself. Traversal stops
 // and returns false as soon as `callback` returns false; returns true when
 // every reachable op has been visited. `op` must belong to this view
 // (checked by EnforceHasOp).
 template <typename Callback>
 bool OpGraphView::VisitAllPendingOps(OpHandleBase *op,
                                     Callback &&callback) const {
  EnforceHasOp(op);
  std::unordered_set<OpHandleBase *> visited;
  std::queue<OpHandleBase *> q;
  q.push(op);
  while (!q.empty()) {
    op = q.front();
    q.pop();
    for (auto &pending_op : pending_ops_.at(op)) {
      // insert().second is false when the op has already been visited.
      if (!visited.insert(pending_op).second) {
        continue;
      }
      if (!callback(pending_op)) {
        return false;
      }
      // Enqueue the newly visited op so that its own pending ops are
      // expanded too; without this only direct successors are ever seen.
      q.push(pending_op);
    }
  }
  return true;
 }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/reference_count_op_handle.h
+++ b/paddle/fluid/framework/details/reference_count_op_handle.h
@ -1,138 +0,0 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <atomic>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor.h"
 namespace paddle {
 namespace framework {
 namespace details {
 // Variable name -> reference count.
 using ReferenceCountMap = std::unordered_map<std::string, int>;
 // Same mapping, with counts that can be decremented atomically.
 using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<int>>;
 // int-keyed collections of the maps above.
 // NOTE(review): the int key is presumably a device id — confirm.
 using DeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
 using AtomicDeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
 // int-keyed owners of Tensor garbage collectors.
 using DeviceGarbageCollectorMap =
    std::unordered_map<int,
                       std::unique_ptr<GarbageCollector<framework::Tensor>>>;
 // Op handle that subtracts per-variable usage counts from a shared atomic
 // reference-count map and passes tensors whose count is used up to a
 // GarbageCollector<Tensor>.
 class ReferenceCountOpHandle : public OpHandleBase {
 public:
  // Caches the CUDA device context for `place`, creates a CUDA event only
  // when `gc` is a StreamGarbageCollector, and registers every name in
  // `var_names` through AddVar. `gc` and `ref_cnts` are not owned.
  ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
                         const platform::CUDAPlace &place,
                         const std::vector<std::string> &var_names,
                         GarbageCollector<Tensor> *gc,
                         AtomicReferenceCountMap *ref_cnts)
      : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    if (IsStreamGarabageCollector()) {
      platform::SetDeviceId(place.device);
      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
    }
    for (auto &name : var_names) AddVar(name);
  }
  // Destroys the event under the same condition it was created.
  ~ReferenceCountOpHandle() {
    if (IsStreamGarabageCollector()) {
      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
      platform::SetDeviceId(gpu_place.device);
      PADDLE_ENFORCE(cudaEventDestroy(event_));
    }
  }
  std::string Name() const override { return "reference_count"; }
  // Records one more use of `name`; the per-name total is the amount that
  // RunImpl subtracts from the shared count.
  void AddVar(const std::string &name) {
    auto it = var_names_.find(name);
    if (it != var_names_.end())
      ++(it->second);
    else
      var_names_[name] = 1;
  }
 protected:
  // For every registered name that has a reference count and exists in the
  // execution scope: subtract this handle's use count, and collect the tensor
  // when the value before the subtraction was <= that use count. Only
  // LoDTensor and SelectedRows variables are handled; other types are skipped
  // without touching their count.
  void RunImpl() override {
    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
    std::vector<Tensor *> tensors;
    for (auto &pair : var_names_) {
      auto &name = pair.first;
      auto it = ref_cnts_->find(name);
      if (it == ref_cnts_->end()) continue;
      auto *var = exec_scope->FindVar(name);
      if (var == nullptr) continue;
      if (var->IsType<LoDTensor>()) {
        if (it->second.fetch_sub(pair.second) <= pair.second) {
          tensors.emplace_back(var->GetMutable<LoDTensor>());
        }
      } else if (var->IsType<SelectedRows>()) {
        if (it->second.fetch_sub(pair.second) <= pair.second) {
          tensors.emplace_back(
              var->GetMutable<SelectedRows>()->mutable_value());
        }
      }
    }
    if (!tensors.empty()) {
      ClearTensors(tensors);
    }
  }
 private:
  // Hands `tensors` to the collector. With a StreamGarbageCollector, a
  // callback is supplied that records `event_` on the compute stream and
  // makes the collector's stream wait on it.
  void ClearTensors(const std::vector<Tensor *> &tensors) {
    auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
    if (gc != nullptr) {
      auto compute_stream = dev_ctx_->stream();
      auto callback_stream = gc->stream();
      auto callback_func = [=]() {
        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
      };
      gc_->Add(tensors, callback_func);
    } else {
      gc_->Add(tensors);
    }
  }
  // True when `gc_` is a StreamGarbageCollector<Tensor>.
  bool IsStreamGarabageCollector() const {
    return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
  }
  const Scope *scope_;
  platform::CUDADeviceContext *dev_ctx_;
  // Variable name -> number of times it was registered via AddVar.
  std::unordered_map<std::string, int> var_names_;
  GarbageCollector<Tensor> *gc_;       // not own
  AtomicReferenceCountMap *ref_cnts_;  // not own
  // Has no initializer; only assigned when a stream collector is in use.
  cudaEvent_t event_;
 };
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
--- a/paddle/fluid/framework/details/reference_count_pass.h
+++ b/paddle/fluid/framework/details/reference_count_pass.h
@ -14,7 +14,6 @@
 #pragma once
 #include "paddle/fluid/framework/details/reference_count_op_handle.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
@ -22,10 +21,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 constexpr char kGlobalReferenceCount[] = "reference_count";
 constexpr char kCurReferenceCount[] = "current_reference_count";
 constexpr char kGarbageCollector[] = "garbage_collector";
 class ReferenceCountPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
--- a/paddle/fluid/framework/details/reference_count_pass_helper.cc
+++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc
@ -0,0 +1,21 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 namespace paddle {
 namespace framework {
 namespace details {}  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/reference_count_pass_helper.h
+++ b/paddle/fluid/framework/details/reference_count_pass_helper.h
@ -0,0 +1,51 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <atomic>
 #include <map>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/garbage_collector.h"
 namespace paddle {
 namespace framework {
 namespace details {
 // Forward declaration; only pointers to it are used in this header.
 class ComputationOpHandle;
 // Variable name -> reference count.
 using ReferenceCountMap = std::unordered_map<std::string, size_t>;
 // Same mapping, with counts that can be decremented atomically.
 using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<size_t>>;
 // Place -> owned garbage collector for that place.
 using GarbageCollectorMap =
    std::map<platform::Place, std::unique_ptr<GarbageCollector>>;
 // String keys naming attributes; the eager deletion pass reads
 // kRuntimeReferenceCount, kGarbageCollector, kAllPlaces and
 // kLastLiveOpsOfVars as pass attributes.
 const char kGlobalReferenceCount[] = "global_reference_count";
 const char kRuntimeReferenceCount[] = "runtime_reference_count";
 const char kGarbageCollector[] = "garbage_collector";
 const char kAllPlaces[] = "all_places";
 // Variable name -> set of computation ops recorded as its last live ops.
 using LastLiveOpsOfVars =
    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle*>>;
 const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@ -18,9 +18,6 @@
 #include <vector>
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/details/reference_count_op_handle.h"
 #endif
 namespace paddle {
 namespace framework {
@ -69,27 +66,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
  drop_scope_counter_ += 1;
 #ifdef PADDLE_WITH_CUDA
  const std::string gc_name = "garbage_collector";
  DeviceGarbageCollectorMap *gc =
      Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
                           : nullptr;
 #endif
  if (!fetch_tensors.empty() ||
      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
    drop_scope_counter_ = 0;
    // Wait All computational streams
    for (auto p : places_) {
      platform::DeviceContextPool::Instance().Get(p)->Wait();
 #ifdef PADDLE_WITH_CUDA
      if (gc != nullptr && platform::is_gpu_place(p)) {
        auto gpu_place = boost::get<platform::CUDAPlace>(p);
        auto &gc_at_place = gc->at(gpu_place.device);
        gc_at_place->Wait();
        gc_at_place->Reset();
      }
 #endif
    }
    for (auto &scope : local_scopes_) {
      auto &local_scope =
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include <deque>
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
@ -41,11 +42,43 @@ namespace {
 int kProgramId = -1;
 }  // namespace
 static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
    const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
  std::unordered_map<std::string, size_t> ref_cnts;
  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
                                            skip_var_list.end());
  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
    for (auto& name_pair : name_map) {
      for (auto& name : name_pair.second) {
        if (skip_vars.count(name)) continue;
        auto* var_desc = block.FindVar(name);
        if (var_desc == nullptr || var_desc->Persistable()) continue;
        auto type = var_desc->Proto()->type().type();
        if (type != proto::VarType::LOD_TENSOR &&
            type != proto::VarType::SELECTED_ROWS &&
            type != proto::VarType::LOD_TENSOR_ARRAY) {
          continue;
        }
        ++ref_cnts[name];
      }
    }
  };
  for (auto op_desc : block.AllOps()) {
    update_ref_cnts(op_desc, op_desc->Inputs());
    update_ref_cnts(op_desc, op_desc->Outputs());
  }
  return ref_cnts;
 }
 // Binds the context to block `block_id` of `prog` (kept by reference).
 // When GetEagerDeletionThreshold() is non-negative, the per-variable
 // reference counts of that block are computed up front, leaving out the
 // names in `skip_ref_cnt_vars`; otherwise the count map stays empty.
 ExecutorPrepareContext::ExecutorPrepareContext(
-    const framework::ProgramDesc& prog, size_t block_id)
+    const framework::ProgramDesc& prog, size_t block_id,
    const std::vector<std::string>& skip_ref_cnt_vars)
    : prog_(prog), block_id_(block_id) {
  // A negative threshold skips reference counting altogether.
  if (GetEagerDeletionThreshold() >= 0) {
-    ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
+    global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
                                                        skip_ref_cnt_vars);
  }
 }
@ -53,28 +86,40 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
 }
-template <typename RefCntMap>
+static void DeleteUnusedTensors(
-static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
+    const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
-                                GarbageCollector<Tensor>* gc,
+    std::unordered_map<std::string, size_t>* ref_cnts) {
-                                RefCntMap* ref_cnts) {
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
  // `garbages` collects the memory holders moved out of variables during this
  // call; the whole batch is handed to `gc` once at the end.
  std::unordered_set<Tensor*> erase_tensors;
  // Walks one name map (inputs or outputs) of `op`. Names that have no entry
  // in `ref_cnts` are ignored.
  auto handler = [&](const VariableNameMap& name_map) {
    for (auto& name_pair : name_map) {
      for (auto& name : name_pair.second) {
        auto it = ref_cnts->find(name);
        if (it == ref_cnts->end()) continue;
        // Each occurrence decrements the count; only the occurrence that
        // brings it to zero goes on to release the variable's memory.
-        if ((it->second)-- == 1) {
+        if (--(it->second) != 0) {
          continue;
        }
        auto* var = scope.FindVar(name);
        // NOTE(review): on the '+' side a non-null `var` hits `continue`, while
        // a null `var` falls through to `var->IsType<...>()` below. The check
        // looks inverted (expected `var == nullptr`) -- confirm.
        if (var != nullptr) {
-            VLOG(10) << "Erase tensor \'" << name << "\'";
+          continue;
        }
        VLOG(2) << "Erase variable " << name;
        if (var->IsType<LoDTensor>()) {
-              erase_tensors.insert(var->GetMutable<LoDTensor>());
+          garbages.emplace_back(
              var->GetMutable<LoDTensor>()->MoveMemoryHolder());
        } else if (var->IsType<SelectedRows>()) {
-              erase_tensors.insert(
+          garbages.emplace_back(var->GetMutable<SelectedRows>()
-                  var->GetMutable<SelectedRows>()->mutable_value());
+                                    ->mutable_value()
-            }
+                                    ->MoveMemoryHolder());
        } else if (var->IsType<LoDTensorArray>()) {
          // Every tensor stored in the array gives up its own memory holder.
          auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
          for (auto& t : *lod_tensor_arr) {
            garbages.emplace_back(t.MoveMemoryHolder());
          }
        } else {
          PADDLE_THROW("Type %s of %s is not supported eager deletion",
                       var->Type().name(), name);
        }
      }
    }
@ -83,8 +128,8 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
  // Input names are processed before output names.
  handler(op->Inputs());
  handler(op->Outputs());
-  if (!erase_tensors.empty()) {
+  if (!garbages.empty()) {
-    gc->Add(erase_tensors);
+    gc->Add(std::move(garbages));
  }
 }
@ -325,9 +370,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 }
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
-    const ProgramDesc& program, int block_id) {
+    const ProgramDesc& program, int block_id,
    const std::vector<std::string>& skip_ref_cnt_vars) {
  std::unique_ptr<ExecutorPrepareContext> ctx(
-      new ExecutorPrepareContext(program, block_id));
+      new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
  auto& block = program.Block(block_id);
  for (auto& op_desc : block.AllOps()) {
@ -338,16 +384,28 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
 }
 // Creates one prepared context per entry of `block_ids`, in the same order,
 // and fills each context with the operators created from that block's op
 // descriptions.
 //
 // `skip_ref_cnt_vars` must be empty or hold exactly one name list per block
 // id; when non-empty, list i is forwarded to the context of block_ids[i].
 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
-    const ProgramDesc& program, const std::vector<int>& block_ids) {
+    const ProgramDesc& program, const std::vector<int>& block_ids,
    const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) {
  PADDLE_ENFORCE(
      skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
      "skip_ref_cnt_vars should be either empty or equals to block number %d",
      block_ids.size());
  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
  // Index into skip_ref_cnt_vars, advanced in step with block_ids.
  size_t idx = 0;
  for (auto& bid : block_ids) {
    // NOTE(review): `ctx` is a raw owning pointer until it is wrapped in a
    // shared_ptr at the end of the iteration. If the enforce or CreateOp
    // below reports failure by throwing, the context would presumably not be
    // freed -- confirm.
-    auto* ctx = new ExecutorPrepareContext(program, bid);
+    ExecutorPrepareContext* ctx;
    if (skip_ref_cnt_vars.empty()) {
      ctx = new ExecutorPrepareContext(program, bid);
    } else {
      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
    }
    // NOTE(review): the range check on `bid` runs only after the context was
    // already constructed with it -- confirm the ordering is intended.
    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
    auto& block = program.Block(bid);
    for (auto& op_desc : block.AllOps()) {
      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
    }
    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
    ++idx;
  }
  return result;
 }
@ -365,22 +423,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }
  int64_t max_memory_size = GetEagerDeletionThreshold();
-  std::unique_ptr<GarbageCollector<Tensor>> gc;
+  std::unique_ptr<GarbageCollector> gc;
-  // WhileOp would set keep_kids to true,
+  // skip while_op and while_grad_op temporarily
  // because WhileGradOp needs the scopes created in WhileOp.
  // Perhaps, we should not perform eager deletion in WhileOp
  // The scopes and variables created by WhileOp would be deleted
  // in WhileGradOp.
  if (max_memory_size >= 0 && !keep_kids) {
    ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
-      gc.reset(new DefaultStreamGarbageCollector<Tensor>(
+      if (IsFastEagerDeletionModeEnabled()) {
        gc.reset(new UnsafeFastGPUGarbageCollector(
            boost::get<platform::CUDAPlace>(place_), max_memory_size));
      } else {
        gc.reset(new DefaultStreamGarbageCollector(
            boost::get<platform::CUDAPlace>(place_), max_memory_size));
      }
    } else if (platform::is_cpu_place(place_)) {
 #endif
-      gc.reset(new CPUGarbageCollector<Tensor>(
+      gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place_),
-          boost::get<platform::CPUPlace>(place_), max_memory_size));
+                                       max_memory_size));
 #ifdef PADDLE_WITH_CUDA
    }
 #endif
@ -389,17 +448,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  for (auto& op : ctx->ops_) {
    op->Run(*local_scope, place_);
-    if (gc != nullptr) {
+    if (gc) {
      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
-                          &(ctx->cur_ref_cnts_));
+                          &(ctx->runtime_ref_cnts_));
    }
  }
  if (gc != nullptr) {
    gc->Wait();
  } else {
  platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }
  if (local_scope != scope) {
    scope->DeleteScope(local_scope);
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@ -27,52 +27,21 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 // Builds a name -> count map for block `block_id` of `prog`. Every time a
 // variable name is listed among the inputs or outputs of an op its count goes
 // up by one. Names without a VarDesc in the block, persistable variables, and
 // variables whose type is neither LOD_TENSOR nor SELECTED_ROWS are ignored.
 template <typename T>
 std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
    const ProgramDesc& prog, size_t block_id) {
  auto& block = prog.Block(block_id);
  std::unordered_map<std::string, T> counts;

  auto tally = [&](const VariableNameMap& slots) {
    for (auto& slot : slots) {
      for (auto& name : slot.second) {
        auto* desc = block.FindVar(name);
        if (desc == nullptr || desc->Persistable()) continue;

        const auto kind = desc->Proto()->type().type();
        const bool countable = kind == proto::VarType::LOD_TENSOR ||
                               kind == proto::VarType::SELECTED_ROWS;
        if (!countable) continue;

        // First sighting stores 1; later sightings bump the stored value.
        auto inserted = counts.emplace(name, 1);
        if (!inserted.second) ++inserted.first->second;
      }
    }
  };

  for (auto* op : block.AllOps()) {
    tally(op->Inputs());
    tally(op->Outputs());
  }
  return counts;
 }
 // State prepared ahead of running one block of a program: the program and
 // block id it was built for, the operators created for the block, and the
 // reference-count maps used when eager deletion is enabled.
 struct ExecutorPrepareContext {
-  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
+  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
                         const std::vector<std::string>& skip_ref_cnt_vars =
                             std::vector<std::string>());
  ~ExecutorPrepareContext();
  // Restores the working counts from the counts computed at construction.
-  void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; }
+  void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }
  // Held by reference, not copied.
  const framework::ProgramDesc& prog_;
  size_t block_id_;
  // Operators of the block; filled by Executor::Prepare.
  std::vector<std::unique_ptr<OperatorBase>> ops_;
  // Counts computed once in the constructor.
-  std::unordered_map<std::string, int> ref_cnts_;
+  std::unordered_map<std::string, size_t> global_ref_cnts_;
  // Working copy that ResetReferenceCount() overwrites.
-  std::unordered_map<std::string, int> cur_ref_cnts_;
+  std::unordered_map<std::string, size_t> runtime_ref_cnts_;
 };
 class Executor {
@ -108,10 +77,14 @@ class Executor {
           const std::string& fetch_holder_name = "fetch");
  static std::unique_ptr<ExecutorPrepareContext> Prepare(
-      const ProgramDesc& program, int block_id);
+      const ProgramDesc& program, int block_id,
      const std::vector<std::string>& skip_ref_cnt_vars =
          std::vector<std::string>());
  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
-      const ProgramDesc& program, const std::vector<int>& block_ids);
+      const ProgramDesc& program, const std::vector<int>& block_ids,
      const std::vector<std::vector<std::string>>& skip_ref_cnt_vars =
          std::vector<std::vector<std::string>>());
  void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@ -0,0 +1,89 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <algorithm>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "paddle/fluid/framework/garbage_collector.h"
 namespace paddle {
 namespace framework {

 // Looks up the device context of `place`, starts with an empty garbage queue
 // and clamps the flush threshold to at least one byte.
 GarbageCollector::GarbageCollector(const platform::Place &place,
                                   size_t max_memory_size)
    : dev_ctx_(platform::DeviceContextPool::Instance().Get(place)),
      garbages_(new GarbageQueue()),
      max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {}

 CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
                                         size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

 // Runs the clearing callback right away, inside this call.
 void CPUGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback();
 }

 #ifdef PADDLE_WITH_CUDA
 UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
    const platform::CUDAPlace &place, size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

 // Runs the clearing callback right away, inside this call, without going
 // through any stream callback.
 void UnsafeFastGPUGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback();
 }

 DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
    const platform::CUDAPlace &place, size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

 // Waits for the stream callbacks registered on the CUDA device context.
 void DefaultStreamGarbageCollector::Wait() const {
  auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(dev_ctx_);
  cuda_ctx->WaitStreamCallback();
 }

 // Registers the clearing callback as a stream callback on the CUDA device
 // context instead of running it inline.
 void DefaultStreamGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(dev_ctx_);
  cuda_ctx->AddStreamCallback(callback);
 }

 // Creates a CUDA stream owned by this collector, on the device of `place`,
 // plus the callback manager bound to that stream.
 StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
                                               size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {
  platform::CUDADeviceGuard device_guard(place.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
  callback_manager_.reset(new platform::StreamCallbackManager(stream_));
 }

 // Synchronizes the owned stream and then destroys it, on the device the
 // collector was created for.
 // NOTE(review): PADDLE_ENFORCE presumably reports failure by throwing; a
 // throw from a destructor would terminate the program -- confirm.
 StreamGarbageCollector::~StreamGarbageCollector() {
  const auto gpu_place =
      boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
  platform::CUDADeviceGuard device_guard(gpu_place.device);
  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 }

 // Returns the stream owned by this collector.
 cudaStream_t StreamGarbageCollector::stream() const { return stream_; }

 // Delegates to the callback manager's Wait().
 void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }

 // Hands the clearing callback to the callback manager of the owned stream.
 void StreamGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback_manager_->AddCallback(callback);
 }
 #endif
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@ -14,7 +14,6 @@
 #pragma once
 #include <algorithm>
 #include <deque>
 #include <functional>
 #include <memory>
@ -24,134 +23,74 @@
 namespace paddle {
 namespace framework {
 // T should have memory_size() and clear() method
 // Accumulates garbage handed in through Add() and tracks its total size.
 // Once that size reaches max_memory_size_, the accumulated queue is swapped
 // for a fresh one and passed to ClearCallback(), which each subclass
 // implements to decide how the release is carried out.
 template <typename T>
 class GarbageCollector {
 public:
-  GarbageCollector(const platform::Place &place, size_t max_memory_size)
+  using GarbageQueue = std::deque<std::shared_ptr<memory::Allocation>>;
      : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
    garbages_.reset(new std::deque<T *>());
    dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
  }
-  virtual ~GarbageCollector() {}
+  GarbageCollector(const platform::Place &place, size_t max_memory_size);
-  void Reset() {
+  virtual ~GarbageCollector() = default;
-    std::lock_guard<std::mutex> guard(mutex_);
+
-    garbages_.reset(new std::deque<T *>());
+  virtual void Wait() const {}
    cur_memory_size_ = 0;
  }
  template <typename Container>
-  void Add(const Container &objs) {
+  void Add(Container &&objs);
    Add(objs, []() {});
  }
  template <typename Container, typename Callback>
-  void Add(const Container &objs, Callback &&callback) {
+  void Add(Container &&objs, Callback &&callback);
    std::shared_ptr<std::deque<T *>> clear_deque;
    {
      std::lock_guard<std::mutex> guard(mutex_);
      for (auto *obj : objs) {
        garbages_->push_back(obj);
        cur_memory_size_ += obj->memory_size();
      }
      if (cur_memory_size_ >= max_memory_size_) {
        cur_memory_size_ = 0;
        clear_deque = garbages_;
        garbages_.reset(new std::deque<T *>());
      }
    }
    if (clear_deque != nullptr) {
      callback();
      ClearCallback([=]() {
        for (auto *obj : *clear_deque) obj->clear();
      });
    }
  }
  // Default is a no-op; subclasses override it when clearing is deferred.
  virtual void Wait() const {}
 protected:
  // Receives the closure that releases one full queue of garbage.
  virtual void ClearCallback(const std::function<void()> &callback) = 0;
  // Device context looked up from the place given to the constructor.
  platform::DeviceContext *dev_ctx_;
-  std::shared_ptr<std::deque<T *>> garbages_;
+  std::unique_ptr<GarbageQueue> garbages_;
  // Locked in Add() while garbages_ and cur_memory_size_ are updated.
  mutable std::mutex mutex_;
  // Flush threshold; the constructor clamps it to at least 1.
  const size_t max_memory_size_;
-  size_t cur_memory_size_ = 0;
+  size_t cur_memory_size_{0};
 };
-template <typename T>
+class CPUGarbageCollector : public GarbageCollector {
 class CPUGarbageCollector : public GarbageCollector<T> {
 public:
  // Forwards `place` and `max_memory_size` to the base collector.
-  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
+  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size);
      : GarbageCollector<T>(place, max_memory_size) {}
 protected:
  // Invokes the clearing callback directly, inside the call.
-  void ClearCallback(const std::function<void()> &callback) override {
+  void ClearCallback(const std::function<void()> &callback) override;
    callback();
  }
 };
 #ifdef PADDLE_WITH_CUDA
-template <typename T>
+class UnsafeFastGPUGarbageCollector : public GarbageCollector {
 class DefaultStreamGarbageCollector : public GarbageCollector<T> {
 public:
-  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
+  UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
-                                size_t max_memory_size)
+                                size_t max_memory_size);
      : GarbageCollector<T>(place, max_memory_size) {}
-  cudaStream_t stream() const {
+ protected:
-    return static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
+  void ClearCallback(const std::function<void()> &callback) override;
-        ->stream();
+};
  }
-  void Wait() const override {
+class DefaultStreamGarbageCollector : public GarbageCollector {
-    this->dev_ctx_->Wait();
+ public:
-    static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
+  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
-        ->WaitStreamCallback();
+                                size_t max_memory_size);
-  }
+
  // Waits for the stream callbacks registered on the CUDA device context.
  void Wait() const override;
 protected:
  // Registers the clearing callback on the CUDA device context through
  // AddStreamCallback instead of running it inline.
-  void ClearCallback(const std::function<void()> &callback) override {
+  void ClearCallback(const std::function<void()> &callback) override;
    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
        ->AddStreamCallback(callback);
  }
 };
-template <typename T>
+class StreamGarbageCollector : public GarbageCollector {
 class StreamGarbageCollector : public GarbageCollector<T> {
 public:
  // Creates the CUDA stream owned by this collector and the callback manager
  // bound to it.
  StreamGarbageCollector(const platform::CUDAPlace &place,
-                         size_t max_memory_size)
+                         size_t max_memory_size);
      : GarbageCollector<T>(place, max_memory_size) {
    PADDLE_ENFORCE(cudaSetDevice(place.device));
    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
  }
  // Synchronizes the owned stream and then destroys it.
-  ~StreamGarbageCollector() {
+  ~StreamGarbageCollector();
    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
    PADDLE_ENFORCE(cudaSetDevice(place.device));
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
  }
-  void Wait() const override {
+  void Wait() const override;
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    std::lock_guard<std::mutex> guard(this->mutex_);
    callback_manager_->Wait();
  }
  // Returns the stream owned by this collector.
-  cudaStream_t stream() const { return stream_; }
+  cudaStream_t stream() const;
 protected:
  // Hands the clearing callback to the callback manager of the owned stream.
-  void ClearCallback(const std::function<void()> &callback) override {
+  void ClearCallback(const std::function<void()> &callback) override;
    std::lock_guard<std::mutex> guard(this->mutex_);
    callback_manager_->AddCallback(callback);
  }
 private:
  // Created in the constructor, destroyed in the destructor.
  cudaStream_t stream_;
@ -159,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector<T> {
 };
 #endif
 // Adds the allocations in `objs` without any extra notification: forwards to
 // the two-argument overload with a callback that does nothing.
 template <typename Container>
 void GarbageCollector::Add(Container &&objs) {
  auto do_nothing = []() {};
  Add(std::forward<Container>(objs), do_nothing);
 }
 template <typename Container, typename Callback>
 void GarbageCollector::Add(Container &&objs, Callback &&callback) {
  GarbageQueue *garbage_queue = nullptr;
  {
    std::lock_guard<std::mutex> guard(mutex_);
    for (auto &obj : objs) {
      if (!obj) continue;
      cur_memory_size_ += obj->size();
      garbages_->push_back(std::move(obj));
    }
    if (cur_memory_size_ >= max_memory_size_) {
      cur_memory_size_ = 0;
      garbage_queue = garbages_.release();
      garbages_.reset(new GarbageQueue());
    }
  }
  if (garbage_queue) {
    callback();
    ClearCallback([garbage_queue]() { delete garbage_queue; });
  }
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@ -73,14 +73,21 @@ class Graph {
  }
  // Returns true when an attribute named `attr_name` is stored in attrs_.
  bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
  }
  // Returns a reference to the attribute stored under `attr_name`.
  // The attribute must be registered (enforced), and the stored value must
  // hold an `AttrType *`; a type mismatch is reported with the expected and
  // the actual type names.
  template <typename AttrType>
  AttrType &Get(const std::string &attr_name) const {
    PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
                   attr_name);
    const auto &stored = attrs_.at(attr_name);
    try {
      AttrType *typed = boost::any_cast<AttrType *>(stored);
      return *typed;
    } catch (boost::bad_any_cast &) {
      PADDLE_THROW(
          "Invalid attribute type of %s error, expected: %s, actual: %s",
          attr_name, typeid(AttrType *).name(), stored.type().name());
    }
  }
  template <typename AttrType>
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@ -51,11 +51,18 @@ class Pass {
  AttrType &Get(const std::string &attr_name) const {
    // The attribute has to be registered before it can be read.
    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
                   "%s attr not registered for pass.", attr_name);
    const auto &stored = attrs_.at(attr_name);
    try {
      // The stored value is expected to hold an `AttrType *`.
      AttrType *typed = boost::any_cast<AttrType *>(stored);
      return *typed;
    } catch (boost::bad_any_cast &) {
      // Report both the requested and the actually stored type.
      PADDLE_THROW(
          "Invalid attribute type of %s error, expected: %s, actual: %s",
          attr_name, typeid(AttrType *).name(), stored.type().name());
    }
  }
  // Returns true when an attribute named `attr_name` is stored in attrs_.
  bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
  }
  void Erase(const std::string &attr_name) {
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -879,6 +879,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
          t = &(var->Get<SelectedRows>().value());
        }
        if (t != nullptr) {
          PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
                         ipt_name, DebugString());
          int tmp = static_cast<int>(t->type());
          PADDLE_ENFORCE(
              tmp == data_type || data_type == -1,
--- a/Show More
+++ b/Show More