From 096673f67527b0fed1aab1843041b9d929fd0fb5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 29 Nov 2018 13:20:29 +0000 Subject: [PATCH 01/14] refactor eager deletion test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 12 +- .../details/computation_op_handle.cc | 6 +- .../framework/details/computation_op_handle.h | 6 +- .../details/eager_deletion_op_handle.cc | 117 ++++++++++ .../details/eager_deletion_op_handle.h | 64 ++++++ .../framework/details/eager_deletion_pass.cc | 96 ++++++++ .../framework/details/eager_deletion_pass.h | 32 +++ .../details/multi_devices_graph_pass.cc | 6 +- .../details/reference_count_op_handle.h | 138 ------------ .../framework/details/reference_count_pass.cc | 213 +++++------------- .../framework/details/reference_count_pass.h | 5 - .../details/reference_count_pass_helper.h | 49 ++++ .../scope_buffered_ssa_graph_executor.cc | 30 +-- .../scope_buffered_ssa_graph_executor.h | 4 + paddle/fluid/framework/garbage_collector.h | 12 +- paddle/fluid/framework/ir/graph.h | 11 +- paddle/fluid/framework/ir/pass.h | 11 +- paddle/fluid/framework/parallel_executor.cc | 106 ++++++--- paddle/fluid/framework/parallel_executor.h | 24 +- paddle/fluid/platform/CMakeLists.txt | 9 +- .../fluid/platform/stream_callback_manager.cc | 70 ++++++ .../fluid/platform/stream_callback_manager.h | 51 +---- 22 files changed, 631 insertions(+), 441 deletions(-) create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.h create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.h delete mode 100644 paddle/fluid/framework/details/reference_count_op_handle.h create mode 100644 paddle/fluid/framework/details/reference_count_pass_helper.h create mode 100644 paddle/fluid/platform/stream_callback_manager.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fe..8cf97d667d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,10 +33,9 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -if (WITH_GPU) - cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle - all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) -endif() +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) @@ -44,10 +43,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle 
broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) -if (WITH_GPU) - list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) -endif() +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c60..7beb8c8de9 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -20,11 +20,13 @@ namespace paddle { namespace framework { namespace details { ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, - platform::Place place) + platform::Place place, + size_t scope_idx) : OpHandleBase(node), op_(framework::OpRegistry::CreateOp(*node->Op())), scope_(scope), - place_(place) {} + place_(place), + scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 662a91d6b4..601ae4f8c6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -28,7 +28,8 @@ namespace framework { namespace details { struct ComputationOpHandle : public OpHandleBase { public: - ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place); + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, + size_t scope_idx); std::string Name() const override; @@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase { void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; } + size_t GetScopeIdx() const { return scope_idx_; } + protected: void RunImpl() override; @@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase { std::unique_ptr op_; Scope *scope_; platform::Place place_; + size_t scope_idx_; bool is_lock_and_record_event_free_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc new file mode 100644 index 0000000000..cd26203376 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
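The `scope_idx` threaded through `ComputationOpHandle` above is what makes the rest of this series device-agnostic: later passes index per-device state (one garbage collector and one reference-count map per local scope) by this integer instead of pattern-matching on `platform::Place`. A minimal sketch of the idea, with hypothetical names rather than the actual Paddle classes:

    #include <cstddef>
    #include <memory>
    #include <vector>

    // Hypothetical per-device bundle: one garbage collector, one ref-count map.
    struct DeviceState {};

    struct OpSketch {
      explicit OpSketch(size_t scope_idx) : scope_idx_(scope_idx) {}
      size_t GetScopeIdx() const { return scope_idx_; }

     private:
      size_t scope_idx_;  // which local scope / device this op runs in
    };

    // A pass picks per-device state without inspecting the op's place at all:
    DeviceState *StateOf(std::vector<std::unique_ptr<DeviceState>> *states,
                         const OpSketch &op) {
      return (*states)[op.GetScopeIdx()].get();
    }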
+ +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { +namespace details { + +EagerDeletionOpHandle::EagerDeletionOpHandle( + ir::Node *node, const Scope *scope, const platform::Place &place, + const std::vector &var_names, GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + dev_ctx_ = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + if (dynamic_cast *>(gc_)) { + platform::SetDeviceId(boost::get(place).device); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + } +#endif + + for (auto &name : var_names) AddVar(name); +} + +EagerDeletionOpHandle::~EagerDeletionOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + platform::SetDeviceId(gpu_place.device); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif +} + +std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } + +void EagerDeletionOpHandle::AddVar(const std::string &name) { + var_names_.insert(name); +} + +void EagerDeletionOpHandle::RunImpl() { + auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + std::vector tensors; + for (auto &name : var_names_) { + auto it = ref_cnts_->find(name); + if (it == ref_cnts_->end()) { + continue; + } + + auto *var = exec_scope->FindVar(name); + if (var == nullptr) { + continue; + } + + if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()->mutable_value()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + tensors.emplace_back(&t); + } + } + } + } + + if (!tensors.empty()) { + ClearTensors(tensors); + } +} + +void EagerDeletionOpHandle::ClearTensors(const std::vector &tensors) { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = + static_cast *>(gc_)->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { +#endif + gc_->Add(tensors); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h new file mode 100644 index 0000000000..8254f21bdf --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace details { + +class EagerDeletionPass; + +class EagerDeletionOpHandle : public OpHandleBase { + public: + EagerDeletionOpHandle(ir::Node *node, const Scope *scope, + const platform::Place &place, + const std::vector &var_names, + GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts); + + ~EagerDeletionOpHandle(); + + std::string Name() const override; + + protected: + void RunImpl() override; + + private: + void ClearTensors(const std::vector &tensors); + + void AddVar(const std::string &name); + + const Scope *scope_; + std::unordered_set var_names_; + GarbageCollector *gc_; // not own + AtomicReferenceCountMap *ref_cnts_; // not own +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext *dev_ctx_{nullptr}; + cudaEvent_t event_{nullptr}; +#endif + + friend class EagerDeletionPass; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc new file mode 100644 index 0000000000..f877c2881c --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
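`EagerDeletionOpHandle::RunImpl` above hinges on `fetch_sub(1)` returning the value *before* the decrement, so exactly one op observes `1`, releases the last reference, and frees the tensor. A self-contained sketch of that last-reference pattern (not the Paddle code):

    #include <atomic>
    #include <cstdio>

    // fetch_sub returns the value *before* the decrement, so exactly one
    // caller observes 1: that caller released the last reference and owns
    // the cleanup.
    bool ReleaseReference(std::atomic<size_t> *ref_cnt) {
      return ref_cnt->fetch_sub(1) == 1;
    }

    int main() {
      std::atomic<size_t> cnt{3};
      for (int i = 0; i < 3; ++i) {
        if (ReleaseReference(&cnt)) {
          std::printf("last reference released on call %d\n", i + 1);  // i == 2
        }
      }
      return 0;
    }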
+ +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, + ir::Graph *graph) { + auto it = std::find_if( + in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != in->Outputs().end()) { + out->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + in->AddOutput(dep_var); + out->AddInput(dep_var); + } + + // Add leaf node to eager_deletion_node + if (out->Outputs().empty()) { + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + out->AddOutput(dummy_leaf); + } +} + +std::unique_ptr EagerDeletionPass::ApplyImpl( + std::unique_ptr graph) const { + auto &vars = graph->Get(kGraphVars); + + auto &ref_cnts = + Get>(kCurReferenceCount); + auto &last_live_ops = Get>(kLastLiveOpsOfVars); + auto &gcs = Get(kGarbageCollector); + + ref_cnts = std::vector(vars.size()); + + std::unordered_map op_map; + for (auto &var_ops_map : last_live_ops) { + for (auto &var_ops_pair : var_ops_map) { + const std::string &var_name = var_ops_pair.first; + for (ComputationOpHandle *op : var_ops_pair.second) { + auto it = op_map.find(op); + if (it != op_map.end()) { + it->second->AddVar(var_name); + } else { + auto *eager_deletion_node = graph->CreateEmptyNode( + "eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, + gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); + AddDependencyBetween(op, eager_deletion_op, graph.get()); + op_map[op] = eager_deletion_op; + } + } + } + } + VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(eager_deletion_pass, + paddle::framework::details::EagerDeletionPass) + .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h new file mode 100644 index 0000000000..d7a7a9709d --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
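`AddDependencyBetween` in the pass above orders each deletion op after its compute op using a data-free control edge: it reuses an existing dummy output if one exists, otherwise it creates a `DummyVarHandle` owned by the graph (`kGraphDepVars`), and it gives sink deletion ops a dummy leaf output so they stay reachable in the dependency graph. A simplified sketch of such a control edge, using stand-in types rather than Paddle's `OpHandleBase`/`VarHandleBase`:

    #include <vector>

    struct Op;
    struct Var {
      Op *producer{nullptr};
      std::vector<Op *> consumers;
    };
    struct Op {
      std::vector<Var *> inputs, outputs;
    };

    // Order `after` behind `before` with a dummy variable that carries no data.
    Var *AddControlEdge(Op *before, Op *after, std::vector<Var *> *graph_owned) {
      Var *dep = new Var;  // exists only to impose ordering
      dep->producer = before;
      dep->consumers.push_back(after);
      before->outputs.push_back(dep);
      after->inputs.push_back(dep);
      graph_owned->push_back(dep);  // mirrors kGraphDepVars keeping ownership
      return dep;
    }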
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index a36ad25926..97830386e4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -562,7 +562,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), - local_scopes_[dev_id], places_[dev_id])); + local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } @@ -685,8 +685,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get(kGraphOps).emplace_back( - new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); + result->Get(kGraphOps).emplace_back(new ComputationOpHandle( + result->CreateOpNode(node->Op()), s, p, scope_idx)); CreateOpHandleIOs(result, node, scope_idx); } } diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h deleted file mode 100644 index cc4ccfbdfc..0000000000 --- a/paddle/fluid/framework/details/reference_count_op_handle.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
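For contrast with the deleted `ReferenceCountOpHandle` below: the old design kept GPU-only state keyed by CUDA device id, while the new one keeps a plain vector indexed by `scope_idx`, which works for CPU and GPU places alike. Roughly, with stand-in type names:

    #include <cstddef>
    #include <map>
    #include <memory>
    #include <vector>

    struct Gc {};  // stand-in for GarbageCollector<Tensor>

    // Old layout (GPU-only): per CUDA device id.
    using DeviceGcMap = std::map<int, std::unique_ptr<Gc>>;

    // New layout: one slot per local scope, valid for CPU and GPU places.
    using GcList = std::vector<std::unique_ptr<Gc>>;

    Gc *Lookup(GcList *gcs, size_t scope_idx) { return (*gcs)[scope_idx].get(); }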
- -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace framework { -namespace details { - -using ReferenceCountMap = std::unordered_map; -using AtomicReferenceCountMap = - std::unordered_map>; -using DeviceReferenceCountMap = - std::unordered_map>; -using AtomicDeviceReferenceCountMap = - std::unordered_map>; -using DeviceGarbageCollectorMap = - std::unordered_map>>; - -class ReferenceCountOpHandle : public OpHandleBase { - public: - ReferenceCountOpHandle(ir::Node *node, const Scope *scope, - const platform::CUDAPlace &place, - const std::vector &var_names, - GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { - dev_ctx_ = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - if (IsStreamGarabageCollector()) { - platform::SetDeviceId(place.device); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - for (auto &name : var_names) AddVar(name); - } - - ~ReferenceCountOpHandle() { - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); - PADDLE_ENFORCE(cudaEventDestroy(event_)); - } - } - - std::string Name() const override { return "reference_count"; } - - void AddVar(const std::string &name) { - auto it = var_names_.find(name); - if (it != var_names_.end()) - ++(it->second); - else - var_names_[name] = 1; - } - - protected: - void RunImpl() override { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; - for (auto &pair : var_names_) { - auto &name = pair.first; - auto it = ref_cnts_->find(name); - if (it == ref_cnts_->end()) continue; - - auto *var = exec_scope->FindVar(name); - if (var == nullptr) continue; - - if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back(var->GetMutable()); - } - } else if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back( - var->GetMutable()->mutable_value()); - } - } - } - - if (!tensors.empty()) { - ClearTensors(tensors); - } - } - - private: - void ClearTensors(const std::vector &tensors) { - auto *gc = dynamic_cast *>(gc_); - if (gc != nullptr) { - auto compute_stream = dev_ctx_->stream(); - auto callback_stream = gc->stream(); - auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); - }; - gc_->Add(tensors, callback_func); - } else { - gc_->Add(tensors); - } - } - - bool IsStreamGarabageCollector() const { - return dynamic_cast *>(gc_) != nullptr; - } - - const Scope *scope_; - platform::CUDADeviceContext *dev_ctx_; - std::unordered_map var_names_; - GarbageCollector *gc_; // not own - AtomicReferenceCountMap *ref_cnts_; // not own - cudaEvent_t event_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 08783fb5f8..f094c7afa9 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -17,184 +17,96 @@ #include 
#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/reference_count_pass.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace details { -static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) { - std::queue queue; - queue.push(var_in); +static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( + OpHandleBase *op, size_t scope_idx) { + std::queue q; + std::unordered_set visited; + q.push(op); do { - auto *var = queue.front(); - queue.pop(); - for (auto *op : var->PendingOps()) { - auto *compute_op = dynamic_cast(op); - if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { - return compute_op; - } - for (auto *out_var : op->Outputs()) { - queue.push(out_var); + auto *op = q.front(); + q.pop(); + auto *compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) { + return compute_op; + } + for (auto *out_var : op->Outputs()) { + for (auto *pending_op : out_var->PendingOps()) { + if (visited.count(pending_op)) continue; + visited.insert(pending_op); } } - } while (!queue.empty()); + } while (!q.empty()); return nullptr; } -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } -} - std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &ref_cnts = Get(kGlobalReferenceCount); - auto &cur_ref_cnts = Get(kCurReferenceCount); - auto &gcs = Get(kGarbageCollector); - - // It is not easy to find the right reference counts of varaibles in graph - // Step 1: Find all variables in computation ops - // Step 2: Find all variables in non-computation ops which refers to variables - // in computation ops - std::unordered_set names; - std::unordered_map - compute_ref_cnt_map; - - auto get_ref_cnts_from_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - std::vector var_names_in_op; - auto *compute_op = dynamic_cast(op); - if (compute_op == nullptr || - !platform::is_gpu_place(compute_op->GetPlace())) - return var_names_in_op; - auto place = boost::get(compute_op->GetPlace()); - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - if (!platform::is_gpu_place(var_handle->place_) || - boost::get(var_handle->place_) != place) + auto &vars = graph->Get(kGraphVars); + auto &ref_cnts = Get>(kGlobalReferenceCount); + auto &last_live_ops_of_vars = + Get>(kLastLiveOpsOfVars); + + last_live_ops_of_vars = std::vector(vars.size()); + ref_cnts = std::vector(vars.size()); + + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + if (name_var_pair.second.empty()) continue; + auto *last_ver_var = name_var_pair.second.back(); + + VarDesc *var_desc = nullptr; + std::find_if(name_var_pair.second.rbegin(), 
name_var_pair.second.rend(), + [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + + if (var_desc == nullptr || var_desc->Persistable()) { continue; - - VarDesc *var_desc = var_handle->Node()->Var(); - auto var_name = var_handle->Node()->Name(); - - // This is weird but there is really some variables without var_desc - // in computation_op - if (var_desc == nullptr) { - var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name); - if (var_desc == nullptr) continue; } - if (var_desc->Persistable()) continue; auto var_type = var_desc->Proto()->type().type(); if (var_type != proto::VarType::LOD_TENSOR && - var_type != proto::VarType::SELECTED_ROWS) { + var_type != proto::VarType::SELECTED_ROWS && + var_type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - // compute op only runs in one device - if (ref_cnts[place.device]->count(var_name)) - ++(*ref_cnts[place.device])[var_name]; - else - (*ref_cnts[place.device])[var_name] = 1; - - names.insert(var_name); - var_names_in_op.push_back(var_name); - } - return var_names_in_op; - }; - - auto update_ref_cnts_from_non_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - if (dynamic_cast(op) != nullptr) return; - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - auto var_name = var_handle->Node()->Name(); - auto var_place = var_handle->place_; - if (!platform::is_gpu_place(var_place)) continue; - auto place = boost::get(var_place); - if (names.count(var_name) == 0) continue; - if (ref_cnts.count(place.device) && - ref_cnts[place.device]->count(var_name)) { - ++(*ref_cnts[place.device])[var_name]; - - auto *next_compute_op = FindNextComputationOpHandle(var_handle); - if (next_compute_op != nullptr) { - if (compute_ref_cnt_map.count(next_compute_op)) { - compute_ref_cnt_map[next_compute_op]->AddVar(var_name); - VLOG(5) << "Add reference count of " << var_name << " to Operator " - << next_compute_op->Name(); - } else { - // Create new reference_count_op_handle - ir::Node *ref_cnt_node = graph->CreateEmptyNode( - "reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, next_compute_op->GetScope(), place, {var_name}, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[next_compute_op] = ref_cnt_handle; - } + std::unordered_set last_live_op; + auto add_last_live_op = [&](OpHandleBase *op) { + auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); + if (compute_op) { + last_live_op.insert(compute_op); + } + }; + const std::string &var_name = name_var_pair.first; + auto &pending_ops = last_ver_var->PendingOps(); + if (pending_ops.empty()) { + auto *generated_op = last_ver_var->GeneratedOp(); + if (generated_op) { + ref_cnts[i].emplace(var_name, 1); + add_last_live_op(generated_op); + } + } else { + ref_cnts[i].emplace(var_name, pending_ops.size()); + for (auto *pending_op : pending_ops) { + add_last_live_op(pending_op); } } - } - }; - auto all_ops = ir::FilterByNodeWrapper(*graph); - for (auto &op : all_ops) { - auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); - auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs()); - if (in_var_names.empty() && out_var_names.empty()) continue; - in_var_names.insert(in_var_names.end(), out_var_names.begin(), - 
out_var_names.end()); - auto *compute_op = dynamic_cast(op); - auto place = boost::get(compute_op->GetPlace()); - ir::Node *ref_cnt_node = - graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, compute_op->GetScope(), place, in_var_names, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[compute_op] = ref_cnt_handle; - } - - for (auto &op : all_ops) { - update_ref_cnts_from_non_compute_op(op, op->Inputs()); - update_ref_cnts_from_non_compute_op(op, op->Outputs()); - } - - std::vector new_all_ops; - new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size()); - for (auto &op : all_ops) { - new_all_ops.emplace_back(std::move(op)); - auto it = compute_ref_cnt_map.find(new_all_ops.back()); - if (it != compute_ref_cnt_map.end()) { - // Add LeafNode to ReferenceCountOpHandle - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - it->second->AddOutput(dummy_leaf); - new_all_ops.emplace_back(std::move(it->second)); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } } - - all_ops.swap(new_all_ops); return graph; } @@ -205,5 +117,4 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( REGISTER_PASS(reference_count_pass, paddle::framework::details::ReferenceCountPass) .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) - .RequirePassAttr(paddle::framework::details::kGarbageCollector); + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars); diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h index 7081280b06..bcbef02735 100644 --- a/paddle/fluid/framework/details/reference_count_pass.h +++ b/paddle/fluid/framework/details/reference_count_pass.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/framework/details/reference_count_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -22,10 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kGlobalReferenceCount[] = "reference_count"; -constexpr char kCurReferenceCount[] = "current_reference_count"; -constexpr char kGarbageCollector[] = "garbage_collector"; - class ReferenceCountPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h new file mode 100644 index 0000000000..77846f7bdf --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
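The counting rule implemented by the new `ReferenceCountPass` above: a variable's initial reference count is the number of ops that still read its last version, and when nothing reads it, the op that produced it is treated as the single last user. A toy model of that rule (hypothetical types, not the pass itself):

    #include <cstddef>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct VarUse {
      std::string name;
      size_t num_pending_readers;  // consumers of the var's last version
    };

    std::unordered_map<std::string, size_t> InitRefCounts(
        const std::vector<VarUse> &vars) {
      std::unordered_map<std::string, size_t> ref_cnts;
      for (const auto &v : vars) {
        // A var nobody reads still gets count 1: its producer releases it.
        ref_cnts[v.name] = v.num_pending_readers > 0 ? v.num_pending_readers : 1;
      }
      return ref_cnts;
    }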
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ComputationOpHandle;
+
+using ReferenceCountMap = std::unordered_map<std::string, size_t>;
+
+using AtomicReferenceCountMap =
+    std::unordered_map<std::string, std::atomic<size_t>>;
+
+using GarbageCollectorList =
+    std::vector<std::unique_ptr<GarbageCollector<Tensor>>>;
+
+const char kGlobalReferenceCount[] = "reference_count";
+const char kCurReferenceCount[] = "current_reference_count";
+const char kGarbageCollector[] = "garbage_collector";
+
+using LastLiveOpsOfVars =
+    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
+const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index e5b1eaa731..f1bf6542a3 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -18,9 +18,6 @@
 #include <string>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/reference_count_op_handle.h"
-#endif
 
 namespace paddle {
 namespace framework {
@@ -33,7 +30,11 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
       underlying_executor_(std::move(underlying_executor)),
       local_scopes_(std::move(local_scopes)),
       var_infos_(std::move(var_infos)),
-      places_(std::move(places)) {}
+      places_(std::move(places)) {
+  if (Graph().Has(details::kGarbageCollector)) {
+    gc_ = &(Graph().Get<details::GarbageCollectorList>(
+        details::kGarbageCollector));
+  }
+}
 
 FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
@@ -69,27 +70,16 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
 
-#ifdef PADDLE_WITH_CUDA
-  const std::string gc_name = "garbage_collector";
-  DeviceGarbageCollectorMap *gc =
-      Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
-                           : nullptr;
-#endif
-
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
     drop_scope_counter_ = 0;
     // Wait All computational streams
-    for (auto p : places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-#ifdef PADDLE_WITH_CUDA
-      if (gc != nullptr && platform::is_gpu_place(p)) {
-        auto gpu_place = boost::get<platform::CUDAPlace>(p);
-        auto &gc_at_place = gc->at(gpu_place.device);
-        gc_at_place->Wait();
-        gc_at_place->Reset();
+    for (size_t i = 0; i < places_.size(); ++i) {
+      platform::DeviceContextPool::Instance().Get(places_[i])->Wait();
+      if (gc_) {
+        (*gc_)[i]->Wait();
+        (*gc_)[i]->Reset();
       }
-#endif
     }
     for (auto &scope : local_scopes_) {
       auto &local_scope =
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 5e87e0bf50..ce3061d6e6 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -21,9 +21,11 @@
 #include "paddle/fluid/framework/details/var_handle.h"
 
 #include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/place.h"
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -55,6 +57,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope *> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
+
+  GarbageCollectorList *gc_{nullptr};
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 818b3334ea..cbe8f606ef 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -65,7 +65,7 @@ class GarbageCollector {
 
     if (clear_deque != nullptr) {
       callback();
-      ClearCallback([=]() {
+      ClearCallback([clear_deque]() {
         for (auto *obj : *clear_deque) obj->clear();
       });
     }
@@ -109,7 +109,6 @@ class DefaultStreamGarbageCollector : public GarbageCollector<T> {
   }
 
   void Wait() const override {
-    this->dev_ctx_->Wait();
     static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
         ->WaitStreamCallback();
   }
@@ -127,14 +126,14 @@ class StreamGarbageCollector : public GarbageCollector<T> {
   StreamGarbageCollector(const platform::CUDAPlace &place,
                          size_t max_memory_size)
       : GarbageCollector<T>(place, max_memory_size) {
-    PADDLE_ENFORCE(cudaSetDevice(place.device));
+    platform::SetDeviceId(place.device);
     PADDLE_ENFORCE(cudaStreamCreate(&stream_));
     callback_manager_.reset(new platform::StreamCallbackManager(stream_));
  }
 
   ~StreamGarbageCollector() {
     auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
-    PADDLE_ENFORCE(cudaSetDevice(place.device));
+    platform::SetDeviceId(place.device);
     PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
     PADDLE_ENFORCE(cudaStreamDestroy(stream_));
   }
@@ -148,8 +147,11 @@ class StreamGarbageCollector : public GarbageCollector<T> {
   cudaStream_t stream() const { return stream_; }
 
  protected:
+  // ClearCallback and Wait()/Reset() cannot be called concurrently from
+  // multiple threads. This is acceptable, because neither Executor nor
+  // ParallelExecutor invokes them from more than one thread, so no
+  // extra locking is needed here.
   void ClearCallback(const std::function<void()> &callback) override {
-    std::lock_guard<std::mutex> guard(this->mutex_);
     callback_manager_->AddCallback(callback);
   }
 
diff --git a/paddle/fluid/framework/ir/graph.h
b/paddle/fluid/framework/ir/graph.h
index 947c934f0f..7a2560c14d 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -73,14 +73,21 @@ class Graph {
   }
 
   bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
   }
 
   template <typename AttrType>
   AttrType &Get(const std::string &attr_name) const {
     PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
                    attr_name);
-    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    try {
+      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    } catch (boost::bad_any_cast &) {
+      PADDLE_THROW(
+          "Invalid attribute type of %s, expected: %s, actual: %s",
+          attr_name, typeid(AttrType *).name(),
+          attrs_.at(attr_name).type().name());
+    }
   }
 
   template <typename AttrType>
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index a3559247db..27746ff145 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -51,11 +51,18 @@ class Pass {
   AttrType &Get(const std::string &attr_name) const {
     PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
                    "%s attr not registered for pass.", attr_name);
-    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    try {
+      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    } catch (boost::bad_any_cast &) {
+      PADDLE_THROW(
+          "Invalid attribute type of %s, expected: %s, actual: %s",
+          attr_name, typeid(AttrType *).name(),
+          attrs_.at(attr_name).type().name());
+    }
   }
 
   bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
   }
 
   void Erase(const std::string &attr_name) {
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b98408ee77..e71f93beef 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -26,6 +26,7 @@ limitations under the License.
*/ #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -49,6 +50,15 @@ class ParallelExecutorPrivate { } } } + + void ResetRuntimeReferenceCount() { + for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { + for (auto &pair : rt_ref_cnts_[i]) { + rt_cur_ref_cnts_[i][pair.first] = pair.second; + } + } + } + std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -60,6 +70,13 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + + // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then + // keeps unchanged + // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ + std::vector rt_ref_cnts_; + std::vector rt_cur_ref_cnts_; + details::GarbageCollectorList gcs_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -128,35 +145,56 @@ ParallelExecutor::ParallelExecutor( std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); +#else + std::unique_ptr graph = + build_strategy.Apply(main_program, member_->places_, loss_var_name, + params, member_->local_scopes_, member_->use_cuda_); +#endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - for (auto &place : member_->places_) { - if (!platform::is_gpu_place(place)) continue; - auto gpu_place = boost::get(place); - if (gcs_[gpu_place.device] == nullptr) { - ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap()); - cur_ref_cnts_[gpu_place.device].reset( - new details::AtomicReferenceCountMap()); - gcs_[gpu_place.device].reset( - new StreamGarbageCollector(gpu_place, max_memory_size)); + size_t place_num = member_->places_.size(); + for (size_t i = 0; i < place_num; ++i) { + auto &place = member_->places_[i]; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + member_->gcs_.emplace_back(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + member_->gcs_.emplace_back(new CPUGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA } - } - if (!gcs_.empty()) { - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = ref_cnt_pass->Apply(std::move(graph)); - graph->SetNotOwned("garbage_collector", &gcs_); +#endif } } -#else - std::unique_ptr graph = - build_strategy.Apply(main_program, member_->places_, loss_var_name, - params, member_->local_scopes_, member_->use_cuda_); -#endif + + if (!member_->gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &(member_->rt_ref_cnts_)); + 
ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + VLOG(10) << "ReferenceCountPass Applied"; + graph = ref_cnt_pass->Apply(std::move(graph)); + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, + &(member_->rt_cur_ref_cnts_)); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, + &(member_->gcs_)); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars @@ -271,18 +309,16 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); -#ifdef PADDLE_WITH_CUDA - if (!gcs_.empty()) { - ResetReferenceCount(); - for (auto &pair : cur_ref_cnts_) { - auto &name_map = *(pair.second); + if (!member_->gcs_.empty()) { + member_->ResetRuntimeReferenceCount(); + size_t n = member_->rt_ref_cnts_.size(); + for (size_t i = 0; i < n; ++i) { for (auto &fetch_name : fetch_tensors) { - name_map.erase(fetch_name); + member_->rt_cur_ref_cnts_[i].erase(fetch_name); } - name_map.erase(fetched_var_name); + member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); } } -#endif auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; @@ -326,13 +362,11 @@ ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - // member_ must be destructed before gcs_ since the destructor of - // ReferenceCountOpHandle use raw pointers of gcs_ inside. - member_.reset(); + delete member_; } } // namespace framework } // namespace paddle -#ifdef PADDLE_WITH_CUDA + USE_PASS(reference_count_pass); -#endif +USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ef09b98b2a..1fc17a0d64 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include #include #include @@ -29,10 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/reference_count_pass.h" -#endif - namespace paddle { namespace framework { @@ -75,24 +70,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - std::unique_ptr member_; - -#ifdef PADDLE_WITH_CUDA - // ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_ - details::DeviceReferenceCountMap ref_cnts_; - details::AtomicDeviceReferenceCountMap cur_ref_cnts_; - details::DeviceGarbageCollectorMap gcs_; - - void ResetReferenceCount() { - for (auto &pair1 : ref_cnts_) { - for (auto &pair2 : *(pair1.second)) { - (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second; - } - } - } -#endif + ParallelExecutorPrivate *member_; }; } // namespace framework diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 93cb5eb2dc..23c7ebe842 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -56,9 +56,16 @@ ELSE() set(MKLDNN_CTX_DEPS) ENDIF() +nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +IF(WITH_GPU) + set(STREAM_CALLBACK_DEPS stream_callback_manager) +ELSE() + set(STREAM_CALLBACK_DEPS) +ENDIF() + # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc +cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc new file mode 100644 index 0000000000..ae915365f8 --- /dev/null +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
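The file that follows exists because of a CUDA constraint: a stream callback runs on a CUDA-internal thread that must not call CUDA APIs, and long work there stalls the stream, which is why `StreamCallbackManager` only enqueues the real work into a single-thread `ThreadPool`. A minimal standalone illustration of the callback contract (not the Paddle class):

    #include <cuda_runtime.h>
    #include <atomic>
    #include <cstdio>

    static std::atomic<bool> g_stream_done{false};

    // Runs on a CUDA-internal thread once all prior work in the stream has
    // finished. CUDA forbids calling CUDA APIs from inside this callback, and
    // heavy work here stalls the stream, so only signal and let another
    // thread do the real cleanup.
    static void CUDART_CB OnStreamDone(cudaStream_t /*stream*/,
                                       cudaError_t status, void * /*user_data*/) {
      if (status == cudaSuccess) g_stream_done.store(true);
    }

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      cudaStreamAddCallback(stream, OnStreamDone, nullptr, 0);
      cudaStreamSynchronize(stream);  // callback has fired once this returns
      std::printf("stream done: %s\n", g_stream_done.load() ? "yes" : "no");
      cudaStreamDestroy(stream);
      return 0;
    }

As the diff shows, CUDA 10 replaces the deprecated `cudaStreamAddCallback` with `cudaLaunchHostFunc`, which takes a plain `void (*)(void *)` host function; the same no-CUDA-calls restriction applies.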
+ +#include "paddle/fluid/platform/stream_callback_manager.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +struct StreamCallbackContext { + inline StreamCallbackContext(const StreamCallbackManager *manager, + std::function callback) + : manager_(manager), callback_(std::move(callback)) {} + + const StreamCallbackManager *manager_; // do not own + std::function callback_; +}; + +StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) + : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} + +void StreamCallbackManager::AddCallback(std::function callback) const { + auto *stream_callback_context = + new StreamCallbackContext(this, std::move(callback)); +#if CUDA_VERSION >= 10000 + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, + StreamCallbackManager::StreamCallbackFunc, + stream_callback_context)); +#else + PADDLE_ENFORCE( + cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); +#endif +} + +void StreamCallbackManager::Wait() const { + thread_pool_.reset(new ::ThreadPool(1)); +} + +#if CUDA_VERSION >= 10000 +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data) +#else +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, + void *user_data) +#endif +{ + auto *callback_context_ptr = + reinterpret_cast(user_data); + callback_context_ptr->manager_->thread_pool_->enqueue( + [callback_context_ptr]() { + std::unique_ptr callback_context( + callback_context_ptr); + callback_context->callback_(); + }); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index ed8734c98c..eac4806d13 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -19,66 +19,29 @@ #include #include #include -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -class StreamCallbackManager; - -struct StreamCallbackContext { - template - inline StreamCallbackContext(const StreamCallbackManager *manager, - Callback &&callback) - : manager_(manager), callback_(callback) {} - - const StreamCallbackManager *manager_; // do not own - std::function callback_; -}; - +// NOTE(zjl): clean StreamCallback to make compilation faster class StreamCallbackManager { public: - explicit inline StreamCallbackManager(cudaStream_t stream = nullptr) - : stream_(stream), thread_pool_(new ThreadPool(1)) {} + explicit StreamCallbackManager(const cudaStream_t stream); - template - inline void AddCallback(Callback &&callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::forward(callback)); -#if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); // NOLINT -#else - PADDLE_ENFORCE(cudaStreamAddCallback( - stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); // NOLINT -#endif - } + void AddCallback(std::function callback) const; - void Wait() const { thread_pool_.reset(new ThreadPool(1)); } + void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr thread_pool_; + mutable std::unique_ptr<::ThreadPool> thread_pool_; -// cudaStreamCallback cannot call CUDA API inside, so we have to use -// thread_pool here #if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data) + static void 
CUDART_CB StreamCallbackFunc(void *user_data); #else static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data) + cudaError_t status, void *user_data); #endif - { - auto *callback_context_ptr = - reinterpret_cast(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue([=]() { - std::unique_ptr callback_context( - callback_context_ptr); - callback_context->callback_(); - }); - } }; } // namespace platform From c47c451a007f33078bfb8f38be4a6cd50922f361 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 11:45:53 +0000 Subject: [PATCH 02/14] fix bug --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/computation_op_handle.cc | 2 + .../details/eager_deletion_op_handle.cc | 23 ++-- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 81 ++++++------ .../fluid/framework/details/op_graph_view.h | 29 +++- .../framework/details/reference_count_pass.cc | 125 ++++++++++++++++-- .../scope_buffered_ssa_graph_executor.cc | 21 ++- .../scope_buffered_ssa_graph_executor.h | 2 + paddle/fluid/framework/executor.cc | 104 +++++++++++---- paddle/fluid/framework/executor.h | 51 ++----- paddle/fluid/framework/garbage_collector.h | 44 +++--- paddle/fluid/framework/operator.cc | 2 + paddle/fluid/framework/parallel_executor.cc | 13 +- paddle/fluid/framework/scope.cc | 6 + paddle/fluid/framework/scope.h | 1 + paddle/fluid/framework/tensor.h | 2 +- .../fluid/operators/controlflow/while_op.cc | 44 +++++- paddle/fluid/operators/reader/ctr_reader.h | 12 +- paddle/fluid/platform/device_context.h | 10 +- .../fluid/platform/stream_callback_manager.cc | 67 +++++----- .../fluid/platform/stream_callback_manager.h | 20 +-- paddle/fluid/pybind/tensor_py.h | 12 +- python/paddle/fluid/__init__.py | 5 +- 24 files changed, 458 insertions(+), 228 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8cf97d667d..8049f5d3f7 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -35,7 +35,7 @@ cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_e cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7beb8c8de9..2bf43fd4e0 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,6 +31,8 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); + VLOG(10) << "Run Op" << Name(); + auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc 
b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index cd26203376..41f616035d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/cuda_device_guard.h" namespace paddle { namespace framework { @@ -23,28 +24,32 @@ namespace details { EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, const Scope *scope, const platform::Place &place, - const std::vector &var_names, GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { + const std::unordered_set &var_names, + GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), + scope_(scope), + var_names_(var_names), + gc_(gc), + ref_cnts_(ref_cnts) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { dev_ctx_ = static_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast *>(gc_)) { - platform::SetDeviceId(boost::get(place).device); + platform::CUDADeviceGuard guard( + boost::get(place).device); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + PADDLE_ENFORCE_NOT_NULL(event_); } } #endif - - for (auto &name : var_names) AddVar(name); } EagerDeletionOpHandle::~EagerDeletionOpHandle() { #ifdef PADDLE_WITH_CUDA if (event_) { auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); + platform::CUDADeviceGuard guard(gpu_place.device); PADDLE_ENFORCE(cudaEventDestroy(event_)); } #endif @@ -52,10 +57,6 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } -void EagerDeletionOpHandle::AddVar(const std::string &name) { - var_names_.insert(name); -} - void EagerDeletionOpHandle::RunImpl() { auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); std::vector tensors; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 8254f21bdf..d8de59cc4d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -25,13 +25,11 @@ class Scope; namespace details { -class EagerDeletionPass; - class EagerDeletionOpHandle : public OpHandleBase { public: EagerDeletionOpHandle(ir::Node *node, const Scope *scope, const platform::Place &place, - const std::vector &var_names, + const std::unordered_set &var_names, GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts); @@ -45,8 +43,6 @@ class EagerDeletionOpHandle : public OpHandleBase { private: void ClearTensors(const std::vector &tensors); - void AddVar(const std::string &name); - const Scope *scope_; std::unordered_set var_names_; GarbageCollector *gc_; // not own @@ -55,8 +51,6 @@ class EagerDeletionOpHandle : public OpHandleBase { platform::CUDADeviceContext *dev_ctx_{nullptr}; cudaEvent_t event_{nullptr}; #endif - - friend class EagerDeletionPass; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index f877c2881c..3a1b37e533 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -26,62 +26,61 @@ namespace paddle { namespace 
framework { namespace details { -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } - - // Add leaf node to eager_deletion_node - if (out->Outputs().empty()) { - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - out->AddOutput(dummy_leaf); - } -} - std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); + const auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kCurReferenceCount); - auto &last_live_ops = Get>(kLastLiveOpsOfVars); + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); auto &gcs = Get(kGarbageCollector); ref_cnts = std::vector(vars.size()); - std::unordered_map op_map; + std::unordered_map> + op_vars_map; + for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; - for (ComputationOpHandle *op : var_ops_pair.second) { - auto it = op_map.find(op); - if (it != op_map.end()) { - it->second->AddVar(var_name); - } else { - auto *eager_deletion_node = graph->CreateEmptyNode( - "eager_deletion", ir::Node::Type::kOperation); - auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, - gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); - AddDependencyBetween(op, eager_deletion_op, graph.get()); - op_map[op] = eager_deletion_op; - } + for (auto *op : var_ops_pair.second) { + op_vars_map[op].insert(var_name); } } } - VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + + for (auto &pair : op_vars_map) { + auto *op = pair.first; + auto &var_names = pair.second; + + auto *eager_deletion_node = + graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), + std::move(var_names), gcs[op->GetScopeIdx()].get(), + &(ref_cnts[op->GetScopeIdx()])); + + auto it = std::find_if( + op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != op->Outputs().end()) { + eager_deletion_op->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + op->AddOutput(dep_var); + eager_deletion_op->AddInput(dep_var); + } + + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + eager_deletion_op->AddOutput(dummy_leaf); + } + + VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; return graph; } diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h index afb3e8e594..77aa02eba5 100644 --- a/paddle/fluid/framework/details/op_graph_view.h +++ b/paddle/fluid/framework/details/op_graph_view.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include #include @@ -34,6 +34,11 @@ class OpGraphView { bool HasOp(OpHandleBase *op) const; + // Use a visitor to visit all pending ops of 
op + // Stop when callback returns false + template + bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const; + private: void Build(const std::vector &ops); void EnforceHasOp(OpHandleBase *op) const; @@ -44,6 +49,28 @@ class OpGraphView { pending_ops_; }; +template +bool OpGraphView::VisitAllPendingOps(OpHandleBase *op, + Callback &&callback) const { + EnforceHasOp(op); + std::unordered_set visited; + std::queue q; + q.push(op); + do { + op = q.front(); + q.pop(); + for (auto &pending_op : pending_ops_.at(op)) { + if (visited.count(pending_op) == 0) { + visited.insert(pending_op); + if (!callback(pending_op)) { + return false; + } + } + } + } while (!q.empty()); + return true; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f094c7afa9..2320d3926a 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -14,11 +14,13 @@ #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/op_graph_view.h" #include "paddle/fluid/framework/details/reference_count_pass.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -27,6 +29,89 @@ namespace paddle { namespace framework { namespace details { +struct OpConnectionDetector { + public: + enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; + + explicit OpConnectionDetector(const std::vector &all_ops) + : graph_(all_ops) {} + + template + std::unordered_set MaxNoDepOps( + const OpSet &op_set) { + using KeyType = typename OpSet::key_type; + static_assert( + std::is_base_of::type>::value, + "Key type of OpSet must be or derived of OpHandleBase"); + + std::vector ops(op_set.begin(), op_set.end()); + std::unordered_set ret; + auto rels = GetRelations(ops); + auto not_before = [](RelationShip r) { return r != kBefore; }; + for (size_t i = 0; i < rels.size(); ++i) { + if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) { + ret.insert(static_cast(ops[i])); + } + } + return ret; + } + + private: + std::vector> GetRelations( + const std::vector ops) { + std::unordered_map op_to_idx; + for (size_t i = 0; i < ops.size(); ++i) { + PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); + op_to_idx[ops[i]] = i; + } + + PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops"); + + std::vector> ret(ops.size()); + for (auto &e : ret) { + e.assign(ops.size(), kSame); + } + + size_t found_num = ops.size(); + size_t total_num = ops.size() * ops.size(); + auto visitor = [&](OpHandleBase *op, size_t i) { + auto it = op_to_idx.find(op); + if (it != op_to_idx.end()) { + size_t j = it->second; + if (ret[i][j] != kSame) { + ret[i][j] = kBefore; + ret[j][i] = kAfter; + found_num += 2; + if (found_num == total_num) { + return false; + } + } + } + return true; + }; + + for (size_t i = 0; i < ops.size(); ++i) { + auto sub_visitor = [&, i](OpHandleBase *op) { return visitor(op, i); }; + if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) { + break; + } + } + + for (size_t i = 0; i < ops.size(); ++i) { + for (size_t j = i + 1; j < ops.size(); ++j) { + if (ret[i][j] != kSame) continue; + ret[i][j] = 
kNoDeps; + ret[j][i] = kNoDeps; + } + } + + return ret; + } + + const OpGraphView graph_; +}; + static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -59,9 +144,15 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( last_live_ops_of_vars = std::vector(vars.size()); ref_cnts = std::vector(vars.size()); + OpConnectionDetector detector(ir::FilterByNodeWrapper(*graph)); + for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) continue; + if (name_var_pair.second.empty()) { + continue; + } + + const std::string &var_name = name_var_pair.first; auto *last_ver_var = name_var_pair.second.back(); VarDesc *var_desc = nullptr; @@ -83,30 +174,46 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( } std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) { + auto add_last_live_op = [&](OpHandleBase *op) -> bool { auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); if (compute_op) { last_live_op.insert(compute_op); + return true; + } else { + return false; } }; - const std::string &var_name = name_var_pair.first; + + bool can_delete = false; auto &pending_ops = last_ver_var->PendingOps(); if (pending_ops.empty()) { auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op) { - ref_cnts[i].emplace(var_name, 1); - add_last_live_op(generated_op); + if (generated_op && add_last_live_op(generated_op)) { + can_delete = true; } } else { - ref_cnts[i].emplace(var_name, pending_ops.size()); + can_delete = true; for (auto *pending_op : pending_ops) { - add_last_live_op(pending_op); + if (!add_last_live_op(pending_op)) { + can_delete = false; + break; + } } } - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (can_delete) { + size_t original_size = last_live_op.size(); + last_live_op = detector.MaxNoDepOps(last_live_op); + if (last_live_op.size() != original_size) { + VLOG(10) << "Shrink last living op number of " << var_name << " from " + << original_size << " to " << last_live_op.size(); + } + ref_cnts[i].emplace(var_name, last_live_op.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + } } } + return graph; } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index f1bf6542a3..0cc3ac8bfb 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -36,6 +36,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( } } +void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { + if (gc_) { + for (auto &gc : *gc_) { + gc->Wait(); + gc->Reset(); + } + } +} + FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { @@ -74,19 +83,19 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (size_t i = 0; i < places_.size(); ++i) { - platform::DeviceContextPool::Instance().Get(places_[i])->Wait(); - if (gc_) { - (*gc_)[i]->Wait(); - (*gc_)[i]->Reset(); - } + for (auto &p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); } + WaitAllGarbageCollectors(); for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); 
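// Note (annotation, not part of the patch): in the Run() loop above, all device
// streams and then all garbage collectors are drained (WaitAllGarbageCollectors)
// before any local exec scope is deleted below, so no pending deallocation
// callback can touch a variable whose scope has already been destroyed. On
// iterations that keep their scopes, the else-branch drains only the collectors.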
scope->DeleteScope(local_scope); } + } else { + WaitAllGarbageCollectors(); } + if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index ce3061d6e6..4d52183a20 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -50,6 +50,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; private: + void WaitAllGarbageCollectors(); + size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 96132a2c18..02d1e4114e 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -37,11 +37,49 @@ namespace { int kProgramId = -1; } // namespace +static std::unordered_map GetNonPersistableReferenceCounts( + const BlockDesc& block, const std::vector& skip_var_list) { + std::unordered_map ref_cnts; + std::unordered_set skip_vars(skip_var_list.begin(), + skip_var_list.end()); + + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + if (skip_vars.count(name)) continue; + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS && + type != proto::VarType::LOD_TENSOR_ARRAY) { + continue; + } + + auto it = ref_cnts.find(name); + if (it != ref_cnts.end()) { + ++it->second; + } else { + ref_cnts[name] = 1; + } + } + } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); + } + return ref_cnts; +} + ExecutorPrepareContext::ExecutorPrepareContext( - const framework::ProgramDesc& prog, size_t block_id) + const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCount(prog_, block_id_); + ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -49,10 +87,9 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } -template -static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, - GarbageCollector* gc, - RefCntMap* ref_cnts) { +static void DeleteUnusedTensors( + const Scope& scope, const OperatorBase* op, GarbageCollector* gc, + std::unordered_map* ref_cnts) { std::unordered_set erase_tensors; auto handler = [&](const VariableNameMap& name_map) { @@ -60,7 +97,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, for (auto& name : name_pair.second) { auto it = ref_cnts->find(name); if (it == ref_cnts->end()) continue; - if ((it->second)-- == 1) { + if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { VLOG(10) << "Erase tensor \'" << name << "\'"; @@ -69,6 +106,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, } else if (var->IsType()) { erase_tensors.insert( var->GetMutable()->mutable_value()); + } else if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + for (auto& 
t : *lod_tensor_arr) { + erase_tensors.insert(&t); + } } } } @@ -351,9 +393,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } std::unique_ptr Executor::Prepare( - const ProgramDesc& program, int block_id) { + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars) { std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id)); + new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -364,16 +407,28 @@ std::unique_ptr Executor::Prepare( } std::vector> Executor::Prepare( - const ProgramDesc& program, const std::vector& block_ids) { + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars) { + PADDLE_ENFORCE( + skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), + "skip_ref_cnt_vars should be either empty or equals to block number %d", + block_ids.size()); std::vector> result; + size_t idx = 0; for (auto& bid : block_ids) { - auto* ctx = new ExecutorPrepareContext(program, bid); + ExecutorPrepareContext* ctx; + if (skip_ref_cnt_vars.empty()) { + ctx = new ExecutorPrepareContext(program, bid); + } else { + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } result.push_back(std::shared_ptr(ctx)); + ++idx; } return result; } @@ -392,18 +447,18 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr> gc; - // WhileOp would set keep_kids to true, - // because WhileGradOp needs the scopes created in WhileOp. - // Perhaps, we should not perform eager deletion in WhileOp - // The scopes and variables created by WhileOp would be deleted - // in WhileGradOp. 
- if (max_memory_size >= 0 && !keep_kids) { + if (max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { - gc.reset(new DefaultStreamGarbageCollector( - boost::get(place_), max_memory_size)); - } else { + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place_), max_memory_size)); + } else { + gc.reset(new DefaultStreamGarbageCollector( + boost::get(place_), max_memory_size)); + } + } else if (platform::is_cpu_place(place_)) { #endif gc.reset(new CPUGarbageCollector( boost::get(place_), max_memory_size)); @@ -415,17 +470,14 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); - if (gc != nullptr) { + if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), &(ctx->cur_ref_cnts_)); } } - if (gc != nullptr) { - gc->Wait(); - } else { - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - } + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + if (gc) gc->Wait(); if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 36b36d49c2..f00d4314b6 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -28,42 +28,11 @@ namespace paddle { namespace framework { extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); -template -std::unordered_map GetNonPersistableReferenceCount( - const ProgramDesc& prog, size_t block_id) { - auto& block = prog.Block(block_id); - std::unordered_map ref_cnts; - - auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - auto* var_desc = block.FindVar(name); - if (var_desc == nullptr || var_desc->Persistable()) continue; - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR && - type != proto::VarType::SELECTED_ROWS) { - continue; - } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } - } - } - }; - - for (auto op_desc : block.AllOps()) { - update_ref_cnts(op_desc, op_desc->Inputs()); - update_ref_cnts(op_desc, op_desc->Outputs()); - } - return ref_cnts; -} - struct ExecutorPrepareContext { - ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); + ~ExecutorPrepareContext(); void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } @@ -72,8 +41,8 @@ struct ExecutorPrepareContext { size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map ref_cnts_; + std::unordered_map cur_ref_cnts_; }; class Executor { @@ -109,10 +78,14 @@ class Executor { const std::string& fetch_holder_name = "fetch"); static std::unique_ptr Prepare( - const ProgramDesc& program, int block_id); + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); static std::vector> Prepare( - const ProgramDesc& program, const std::vector& block_ids); + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars = + std::vector>()); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git 
a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index cbe8f606ef..1382e0d461 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -19,6 +19,9 @@ #include #include #include // NOLINT +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -36,6 +39,11 @@ class GarbageCollector { virtual ~GarbageCollector() {} + size_t NumOfGarbages() const { + std::lock_guard guard(mutex_); + return garbages_->size(); + } + void Reset() { std::lock_guard guard(mutex_); garbages_.reset(new std::deque()); @@ -49,7 +57,7 @@ class GarbageCollector { template void Add(const Container &objs, Callback &&callback) { - std::shared_ptr> clear_deque; + std::deque *clear_deque = nullptr; { std::lock_guard guard(mutex_); for (auto *obj : objs) { @@ -58,7 +66,7 @@ class GarbageCollector { } if (cur_memory_size_ >= max_memory_size_) { cur_memory_size_ = 0; - clear_deque = garbages_; + clear_deque = garbages_.release(); garbages_.reset(new std::deque()); } } @@ -67,6 +75,7 @@ class GarbageCollector { callback(); ClearCallback([clear_deque]() { for (auto *obj : *clear_deque) obj->clear(); + delete clear_deque; }); } } @@ -77,7 +86,7 @@ class GarbageCollector { virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::shared_ptr> garbages_; + std::unique_ptr> garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; size_t cur_memory_size_ = 0; @@ -96,6 +105,19 @@ class CPUGarbageCollector : public GarbageCollector { }; #ifdef PADDLE_WITH_CUDA +template +class UnsafeFastGPUGarbageCollector : public GarbageCollector { + public: + UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + + protected: + void ClearCallback(const std::function &callback) override { + callback(); + } +}; + template class DefaultStreamGarbageCollector : public GarbageCollector { public: @@ -109,7 +131,7 @@ class DefaultStreamGarbageCollector : public GarbageCollector { } void Wait() const override { - static_cast(this->dev_ctx_) + static_cast(this->dev_ctx_) ->WaitStreamCallback(); } @@ -126,31 +148,23 @@ class StreamGarbageCollector : public GarbageCollector { StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - platform::SetDeviceId(place.device); + platform::CUDADeviceGuard guard(place.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } ~StreamGarbageCollector() { auto place = boost::get(this->dev_ctx_->GetPlace()); - platform::SetDeviceId(place.device); + platform::CUDADeviceGuard guard(place.device); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); } - void Wait() const override { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - std::lock_guard guard(this->mutex_); - callback_manager_->Wait(); - } + void Wait() const override { callback_manager_->Wait(); } cudaStream_t stream() const { return stream_; } protected: - // ClearCallback and Wait()/Reset() cannot be call in multiple threads - // But it is not important, because they would not be called in multiple - // threads - // either in Executor or ParallelExecutor void ClearCallback(const std::function &callback) override { callback_manager_->AddCallback(callback); } 
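For readers following the garbage_collector.h hunks above: Add() batches garbage by accumulated size instead of freeing each tensor immediately, and hands a full batch to the virtual ClearCallback (synchronous for the CPU and unsafe-fast GPU collectors, deferred onto a stream callback for StreamGarbageCollector). A minimal standalone sketch of that pattern; the names BatchedCollector and Garbage are invented for illustration, and the release loop runs inline here rather than through a callback:

#include <cstddef>
#include <deque>
#include <functional>
#include <memory>
#include <mutex>

struct Garbage {
  size_t size;
  std::function<void()> clear;
};

class BatchedCollector {
 public:
  explicit BatchedCollector(size_t max_memory_size)
      : max_memory_size_(max_memory_size),
        garbages_(new std::deque<Garbage>()) {}

  // Collect under the lock; release the filled batch outside of it, exactly
  // like the release()/reset() handoff in GarbageCollector::Add() above.
  void Add(Garbage g) {
    std::deque<Garbage> *full_batch = nullptr;
    {
      std::lock_guard<std::mutex> guard(mutex_);
      cur_memory_size_ += g.size;
      garbages_->push_back(std::move(g));
      if (cur_memory_size_ >= max_memory_size_) {
        cur_memory_size_ = 0;
        full_batch = garbages_.release();
        garbages_.reset(new std::deque<Garbage>());
      }
    }
    if (full_batch != nullptr) {
      for (auto &obj : *full_batch) obj.clear();
      delete full_batch;
    }
  }

 private:
  const size_t max_memory_size_;
  size_t cur_memory_size_{0};
  std::unique_ptr<std::deque<Garbage>> garbages_;
  std::mutex mutex_;
};

The raw-pointer handoff mirrors the patch's switch from a shared_ptr deque to unique_ptr plus release(): a filled batch has exactly one owner and is cleared without holding the lock.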
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8bfdf38912..a5f714fc89 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -873,6 +873,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get<SelectedRows>().value()); } if (t != nullptr) { + PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", + ipt_name, DebugString()); int tmp = static_cast<int>(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e71f93beef..3d466e44a1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -158,8 +158,13 @@ ParallelExecutor::ParallelExecutor( auto &place = member_->places_[i]; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { - member_->gcs_.emplace_back(new StreamGarbageCollector<Tensor>( - boost::get<platform::CUDAPlace>(place), max_memory_size)); + if (IsFastEagerDeletionModeEnabled()) { + member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector<Tensor>( + boost::get<platform::CUDAPlace>(place), max_memory_size)); + } else { + member_->gcs_.emplace_back(new StreamGarbageCollector<Tensor>( + boost::get<platform::CUDAPlace>(place), max_memory_size)); + } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; } else if (platform::is_cpu_place(place)) { #endif @@ -181,8 +186,8 @@ ParallelExecutor::ParallelExecutor( &(member_->rt_ref_cnts_)); ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, &last_live_ops_of_vars); - VLOG(10) << "ReferenceCountPass Applied"; graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; auto eager_deletion_pass = ir::PassRegistry::Instance().Get("eager_deletion_pass"); @@ -194,6 +199,8 @@ ParallelExecutor::ParallelExecutor( &last_live_ops_of_vars); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); } // Step 3. Create vars in each scope. Passes may also create new vars. diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7cc..cb3b6cdc3e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,6 +38,10 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clears tensors. " "Disabled when this value is less than 0."); +DEFINE_bool(fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory is released " + "immediately, without waiting for the GPU kernel to finish."); + // When in inference scenario, the scopes will not be written by two threads at // the same time, but a scope may be read by multiple threads concurrently, and // the mutex will cause serious performance issues.
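Taken together with the executor.cc hunk earlier in this patch, the garbage-collector choice reduces to a small decision table. A standalone sketch of that policy; the enum and helper below are illustrative, not Paddle APIs:

#include <cstdint>

enum class GCKind { kNone, kCPU, kUnsafeFastGPU, kStreamGPU };

// FLAGS_eager_delete_tensor_gb < 0 disables eager deletion entirely; on GPU
// places, FLAGS_fast_eager_deletion_mode selects the collector that frees
// memory immediately instead of waiting on the compute stream.
GCKind SelectGC(bool is_gpu_place, bool fast_mode, int64_t max_memory_size) {
  if (max_memory_size < 0) return GCKind::kNone;
  if (!is_gpu_place) return GCKind::kCPU;
  return fast_mode ? GCKind::kUnsafeFastGPU : GCKind::kStreamGPU;
}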
@@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() { (static_cast<int64_t>(1) << 30)); } +bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } + Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 1901ffbe57..aded1f771c 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -27,6 +27,7 @@ namespace paddle { namespace framework { int64_t GetEagerDeletionThreshold(); +bool IsFastEagerDeletionModeEnabled(); class Scope; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b..3a4c52410e 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_ = nullptr; } + void clear() { holder_.reset(); } const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 6c1b2f329a..d8410b4058 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -59,7 +59,21 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr<bool>("is_test"); - auto ctx = executor.Prepare(*program, block->ID()); + auto &skip_eager_deletion_vars = + Attr<std::vector<std::string>>("skip_eager_deletion_vars"); + if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { + std::string debug_string = + "Skip " + std::to_string(skip_eager_deletion_vars.size()) + + " vars in eager deletion mode: "; + for (auto &var : skip_eager_deletion_vars) { + debug_string.append(var); + debug_string.push_back(' '); + } + VLOG(10) << debug_string; + } + + auto ctx = + executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); while (cond.data<bool>()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -96,6 +110,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); + AddAttr<std::vector<std::string>>("skip_eager_deletion_vars", "Vars that would skip eager deletion. " "Users should not set this manually.") .SetDefault(std::vector<std::string>()); AddComment(R"DOC( )DOC"); } @@ -341,6 +359,30 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed.
while_grad->SetAttr("original_output_grad", output_grads_list); + /* The following codes are used in eager deletion mode */ + if (framework::GetEagerDeletionThreshold() >= 0) { + std::unordered_set skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + // If input var of ops inside grad_block is not from grad_block, + // it cannot be deleted when forward while_op runs + if (in_arg_name != framework::kEmptyVarName && + !grad_block->HasVar(in_arg_name)) { + skip_vars.insert(in_arg_name); + } + } + } + + if (!skip_vars.empty()) { + // FIXME(zjl): ugly const_cast here, maybe we should find a better way + // to modify forward while_op + auto &fwd_while_op = const_cast(ForwardOp()); + fwd_while_op.SetAttr( + "skip_eager_deletion_vars", + std::vector(skip_vars.begin(), skip_vars.end())); + } + } + return std::unique_ptr(while_grad); } }; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae1..7fc07efe73 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -16,6 +16,7 @@ #include +#include #include // NOLINT #include #include @@ -55,8 +56,7 @@ class CTRReader : public framework::FileReader { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = - file_list_.size() > thread_num ? thread_num : file_list_.size(); + thread_num_ = std::min(file_list_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -95,10 +95,10 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { - read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, - thread_id, &read_thread_status_, queue_))); + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + read_threads_.emplace_back(new std::thread(std::bind( + &ReadThread, file_groups_[thread_id], slots_, batch_size_, + static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( std::bind(&MonitorThread, &read_thread_status_, queue_))); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3edd727978..37453a8c29 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -223,14 +223,10 @@ class CUDADeviceContext : public DeviceContext { template void AddStreamCallback(Callback&& callback) const { - std::lock_guard guard(callback_mtx_); callback_manager_->AddCallback(callback); } - void WaitStreamCallback() const { - std::lock_guard guard(callback_mtx_); - callback_manager_->Wait(); - } + void WaitStreamCallback() const { callback_manager_->Wait(); } #if CUDA_VERSION >= 9000 /*! 
\brief CublasCall may need to change cublas's config, @@ -261,9 +257,7 @@ class CUDADeviceContext : public DeviceContext { mutable std::mutex mtx_; - // This lock is only used by callback - // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes - mutable std::mutex callback_mtx_; + // StreamCallbackManager is thread-safe std::unique_ptr<StreamCallbackManager> callback_manager_; mutable std::mutex cublas_mtx_; diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index ae915365f8..58ec6f2f5d 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -18,52 +18,47 @@ namespace paddle { namespace platform { -struct StreamCallbackContext { - inline StreamCallbackContext(const StreamCallbackManager *manager, - std::function<void()> callback) - : manager_(manager), callback_(std::move(callback)) {} - - const StreamCallbackManager *manager_; // do not own - std::function<void()> callback_; -}; +#if CUDA_VERSION >= 10000 +static void CUDART_CB StreamCallbackFunc(void *user_data) +#else +static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, void *user_data) +#endif +{ + std::unique_ptr<std::function<void()>> func( + reinterpret_cast<std::function<void()> *>(user_data)); + (*func)(); +} StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) - : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} + : stream_(stream), thread_pool_(1) {} void StreamCallbackManager::AddCallback(std::function<void()> callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::move(callback)); + auto *callback_func = new std::function<void()>(std::move(callback)); + auto *func = new std::function<void()>([this, callback_func] { + std::lock_guard<std::mutex> lock(mtx_); + last_future_ = thread_pool_.enqueue([callback_func] { + std::unique_ptr<std::function<void()>> releaser(callback_func); + (*callback_func)(); + }); + }); #if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func)); #else - PADDLE_ENFORCE( - cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); + PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif } -void StreamCallbackManager::Wait() const { - thread_pool_.reset(new ::ThreadPool(1)); -} +StreamCallbackManager::~StreamCallbackManager() { Wait(); } -#if CUDA_VERSION >= 10000 -void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data) -#else -void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, - void *user_data) -#endif -{ - auto *callback_context_ptr = - reinterpret_cast<StreamCallbackContext *>(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue( - [callback_context_ptr]() { - std::unique_ptr<StreamCallbackContext> callback_context( - callback_context_ptr); - callback_context->callback_(); - }); +void StreamCallbackManager::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + { + std::lock_guard<std::mutex> lock(mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } } } // namespace platform diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index eac4806d13..0d5d85bf46 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -18,30 +18,32 @@ #include <cuda.h> #include <cuda_runtime.h> #include <functional> +#include <future> // NOLINT #include <memory> +#include <mutex> // NOLINT + +#include
"paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -// NOTE(zjl): clean StreamCallback to make compilation faster +// NOTE(zjl): clean StreamCallbackManager to make compilation faster +// Make StreamCallbackManager thread-safe class StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); + ~StreamCallbackManager(); + void AddCallback(std::function<void()> callback) const; void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr<::ThreadPool> thread_pool_; -#if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data); -#else - static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data); -#endif + mutable ::ThreadPool thread_pool_; + mutable std::mutex mtx_; + mutable std::future<void> last_future_; }; } // namespace platform diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 02a75236f6..24800e1709 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -162,7 +162,7 @@ void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -200,7 +200,7 @@ void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -221,7 +221,7 @@ inline void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -240,7 +240,7 @@ void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -260,7 +260,7 @@ inline void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f7fefb3e5b..2690149e9b 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,8 +116,9 @@ def __bootstrap__(): 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", - 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', + 'allocator_strategy', 'reader_queue_speed_test_mode', + 'print_sub_graph_dir' ] if 'Darwin' not in
sysstr: read_env_flags.append('use_pinned_memory') From 35a2578426840642acc0b2100be0b1c96c2cf1e9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 13:21:49 +0000 Subject: [PATCH 03/14] fix bug test=develop --- .../framework/details/computation_op_handle.cc | 2 -- .../framework/details/reference_count_pass.cc | 14 +++++++++----- paddle/fluid/platform/stream_callback_manager.cc | 2 -- paddle/fluid/platform/stream_callback_manager.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 2bf43fd4e0..7beb8c8de9 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,8 +31,6 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); - VLOG(10) << "Run Op" << Name(); - auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 2320d3926a..0c096e0980 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,7 +29,7 @@ namespace paddle { namespace framework { namespace details { -struct OpConnectionDetector { +class OpConnectionDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; @@ -37,8 +37,8 @@ struct OpConnectionDetector { : graph_(all_ops) {} template - std::unordered_set MaxNoDepOps( - const OpSet &op_set) { + OpSet MaxNoDepOps(const OpSet &op_set) { + if (op_set.size() <= 1) return op_set; using KeyType = typename OpSet::key_type; static_assert( std::is_base_of ops(op_set.begin(), op_set.end()); - std::unordered_set ret; + OpSet ret; auto rels = GetRelations(ops); auto not_before = [](RelationShip r) { return r != kBefore; }; for (size_t i = 0; i < rels.size(); ++i) { @@ -79,7 +79,7 @@ struct OpConnectionDetector { auto it = op_to_idx.find(op); if (it != op_to_idx.end()) { size_t j = it->second; - if (ret[i][j] != kSame) { + if (i != j && ret[i][j] == kSame) { ret[i][j] = kBefore; ret[j][i] = kAfter; found_num += 2; @@ -208,6 +208,10 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( VLOG(10) << "Shrink last living op number of " << var_name << " from " << original_size << " to " << last_live_op.size(); } + + PADDLE_ENFORCE(!last_live_op.empty(), + "Last living ops of %s cannot be empty", var_name); + ref_cnts[i].emplace(var_name, last_live_op.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 58ec6f2f5d..466c77469e 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -49,8 +49,6 @@ void StreamCallbackManager::AddCallback(std::function callback) const { #endif } -StreamCallbackManager::~StreamCallbackManager() { Wait(); } - void StreamCallbackManager::Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); { diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 0d5d85bf46..8668bcb113 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -33,7 +33,7 @@ class 
StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); - ~StreamCallbackManager(); + ~StreamCallbackManager() = default; void AddCallback(std::function<void()> callback) const; From 2d0d037d8e9e1580d38e800fd1a0d0b0056422eb Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 04/14] fix while_op eager deletion bug add unittest test=develop --- paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 5 files changed, 140 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034..f443c2d8cf 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType<LoDTensor>()) { erase_tensors.insert(var->GetMutable<LoDTensor>()); } else if (var->IsType<SelectedRows>()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b4058..da7cad82d8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector<std::string> &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr<bool>("is_test"); - auto &skip_eager_deletion_vars = - Attr<std::vector<std::string>>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data<bool>()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training.
Some layers may run faster when this is true.") .SetDefault(false); - AddAttr<std::vector<std::string>>("skip_eager_deletion_vars", + AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars, "Vars that would skip eager deletion. " "Users should not set this manually.") .SetDefault(std::vector<std::string>()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The following code is used in eager deletion mode */ + std::unordered_set<std::string> bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set<std::string> skip_vars; + std::unordered_set<std::string> fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast<framework::OpDesc &>(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector<std::string>(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector<std::string>(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector<std::string>(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr<framework::OpDesc>(while_grad); } }; diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 0000000000..7ec1f0ae75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 0000000000..2dcdbdb8f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 0000000000..754d5fd409 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() From e694d0c2e487a854103e0cc4796f92af6d27ccfd Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 05/14] fix while_op eager deletion bug add unittest test=develop --- .../details/eager_deletion_op_handle.cc | 2 + paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 6 files changed, 142 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 41f616035d..54715fed8d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,9 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034..f443c2d8cf 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b4058..da7cad82d8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto &skip_eager_deletion_vars = - Attr>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : 
skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data<bool>()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); - AddAttr<std::vector<std::string>>("skip_eager_deletion_vars", + AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars, "Vars that would skip eager deletion. " "Users should not set this manually.") .SetDefault(std::vector<std::string>()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The following code is used in eager deletion mode */ + std::unordered_set<std::string> bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set<std::string> skip_vars; + std::unordered_set<std::string> fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast<framework::OpDesc &>(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector<std::string>(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector<std::string>(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector<std::string>(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr<framework::OpDesc>(while_grad); } diff --git
a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 0000000000..7ec1f0ae75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 0000000000..2dcdbdb8f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 0000000000..754d5fd409 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
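(An aside on the pattern shared by every test file added in this series, not part of the patch itself: the FLAGS_eager_delete_tensor_gb assignment must land in os.environ before anything from paddle is imported, because fluid reads its flags from the environment when the core library loads. Note also that "0.0" does not mean disabled. A minimal sketch of the idea:

import os

# Must be set before the first paddle import anywhere in the process;
# only a negative value disables eager deletion. "0.0" means a zero-GB
# buffering threshold, i.e. release every unused tensor as soon as its
# reference count reaches zero.
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"

import paddle.fluid  # noqa: E402  (deliberately imported after the env var)

The test below, like the MNIST one above, simply inherits an existing test class so that its whole suite reruns with eager deletion enabled.)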
+ +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() From d0c8b9b9b350f774a7b195bf6c807b90b5f895f9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 12:00:28 +0000 Subject: [PATCH 06/14] remove timeout unittest test=develop --- paddle/fluid/framework/tensor.h | 2 +- .../test_eager_deletion_seresnext.py | 27 ------------------- 2 files changed, 1 insertion(+), 28 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 3a4c52410e..71e8badd4b 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_.reset(); } + void clear() { holder_ = nullptr; } const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py deleted file mode 100644 index 2dcdbdb8f1..0000000000 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import unittest -os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" - -from test_parallel_executor_seresnext import TestResnet - - -class EagerDeletionTestSEResNext(TestResnet): - pass - - -if __name__ == '__main__': - unittest.main() From 387bac46b5e4d95e2888773975d1b6c3a906a588 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:09:43 +0000 Subject: [PATCH 07/14] refine code test=develop --- .../framework/details/eager_deletion_pass.cc | 10 +- .../fluid/framework/details/op_graph_view.cc | 2 + .../framework/details/reference_count_pass.cc | 14 +- .../details/reference_count_pass_helper.h | 10 +- .../scope_buffered_ssa_graph_executor.cc | 8 +- .../scope_buffered_ssa_graph_executor.h | 2 +- paddle/fluid/framework/executor.cc | 14 +- paddle/fluid/framework/executor.h | 6 +- paddle/fluid/framework/parallel_executor.cc | 153 ++++++++++-------- .../fluid/operators/controlflow/while_op.cc | 10 +- 10 files changed, 122 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 3a1b37e533..85991c71e6 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -31,10 +31,11 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( const auto &vars = graph->Get(kGraphVars); auto &ref_cnts = - Get>(kCurReferenceCount); + Get>(kRuntimeReferenceCount); const auto &last_live_ops = Get>(kLastLiveOpsOfVars); - auto &gcs = Get(kGarbageCollector); + auto &gcs = Get(kGarbageCollector); + const auto &places = Get>(kAllPlaces); ref_cnts = std::vector(vars.size()); @@ -58,7 +59,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); auto *eager_deletion_op = new EagerDeletionOpHandle( eager_deletion_node, op->GetScope(), op->GetPlace(), - std::move(var_names), gcs[op->GetScopeIdx()].get(), + std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(), &(ref_cnts[op->GetScopeIdx()])); auto it = std::find_if( @@ -90,6 +91,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( REGISTER_PASS(eager_deletion_pass, paddle::framework::details::EagerDeletionPass) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount) .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index 4838c4198f..b6b5ad42c4 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -23,6 +23,8 @@ namespace details { OpGraphView::OpGraphView(const std::vector &ops) { Build(ops); } void OpGraphView::Build(const std::vector &ops) { + preceding_ops_.clear(); + pending_ops_.clear(); for (auto &op : ops) { preceding_ops_[op]; pending_ops_[op]; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 0c096e0980..f2c9dfb524 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,22 +29,22 @@ namespace paddle { namespace framework { namespace details { -class OpConnectionDetector { +class OpRelationDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, 
kAfter = 3 }; - explicit OpConnectionDetector(const std::vector &all_ops) + explicit OpRelationDetector(const std::vector &all_ops) : graph_(all_ops) {} template - OpSet MaxNoDepOps(const OpSet &op_set) { - if (op_set.size() <= 1) return op_set; + OpSet MaxNoDepOps(const OpSet &op_set) const { using KeyType = typename OpSet::key_type; static_assert( std::is_base_of::type>::value, - "Key type of OpSet must be or derived of OpHandleBase"); + "Key type of OpSet must be OpHandleBase, or derived of OpHandleBase"); + if (op_set.size() <= 1) return op_set; std::vector ops(op_set.begin(), op_set.end()); OpSet ret; auto rels = GetRelations(ops); @@ -59,7 +59,7 @@ class OpConnectionDetector { private: std::vector> GetRelations( - const std::vector ops) { + const std::vector ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); @@ -144,7 +144,7 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( last_live_ops_of_vars = std::vector(vars.size()); ref_cnts = std::vector(vars.size()); - OpConnectionDetector detector(ir::FilterByNodeWrapper(*graph)); + OpRelationDetector detector(ir::FilterByNodeWrapper(*graph)); for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index 77846f7bdf..eb534f9701 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -33,12 +34,13 @@ using ReferenceCountMap = std::unordered_map; using AtomicReferenceCountMap = std::unordered_map>; -using GarbageCollectorList = - std::vector>>; +using GarbageCollectorMap = + std::map>>; -const char kGlobalReferenceCount[] = "reference_count"; -const char kCurReferenceCount[] = "current_reference_count"; +const char kGlobalReferenceCount[] = "global_reference_count"; +const char kRuntimeReferenceCount[] = "runtime_reference_count"; const char kGarbageCollector[] = "garbage_collector"; +const char kAllPlaces[] = "all_places"; using LastLiveOpsOfVars = std::unordered_map>; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index da5e277f27..b8775fc329 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -32,15 +32,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( var_infos_(std::move(var_infos)), places_(std::move(places)) { if (Graph().Has(details::kGarbageCollector)) { - gc_ = &(Graph().Get(details::kGarbageCollector)); + gc_ = &(Graph().Get(details::kGarbageCollector)); } } void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { if (gc_) { - for (auto &gc : *gc_) { - gc->Wait(); - gc->Reset(); + for (auto &gc_pair : *gc_) { + gc_pair.second->Wait(); + gc_pair.second->Reset(); } } } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 4d52183a20..6086a219e0 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -60,7 +60,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { 
std::vector var_infos_; std::vector places_; - GarbageCollectorList* gc_{nullptr}; + GarbageCollectorMap* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index f443c2d8cf..04425a5983 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -56,13 +56,7 @@ static std::unordered_map GetNonPersistableReferenceCounts( type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } + ++ref_cnts[name]; } } }; @@ -79,8 +73,8 @@ ExecutorPrepareContext::ExecutorPrepareContext( const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -443,7 +437,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), - &(ctx->cur_ref_cnts_)); + &(ctx->runtime_ref_cnts_)); } } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 412ebd1904..5a040ac641 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -34,14 +34,14 @@ struct ExecutorPrepareContext { ~ExecutorPrepareContext(); - void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } + void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; } const framework::ProgramDesc& prog_; size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map global_ref_cnts_; + std::unordered_map runtime_ref_cnts_; }; class Executor { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3d466e44a1..dfd031f119 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -51,11 +51,22 @@ class ParallelExecutorPrivate { } } - void ResetRuntimeReferenceCount() { - for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { - for (auto &pair : rt_ref_cnts_[i]) { - rt_cur_ref_cnts_[i][pair.first] = pair.second; + std::unique_ptr PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size); + + inline bool HasGarbageCollectors() const { return !gcs_.empty(); } + + void ResetRuntimeReferenceCount(const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + for (size_t i = 0; i < runtime_ref_cnts_.size(); ++i) { + for (auto &pair : global_ref_cnts_[i]) { + runtime_ref_cnts_[i][pair.first] = pair.second; + } + + for (auto &fetch_name : fetch_tensors) { + runtime_ref_cnts_[i].erase(fetch_name); } + runtime_ref_cnts_[i].erase(fetched_var_name); } } @@ -71,14 +82,75 @@ class ParallelExecutorPrivate { bool use_cuda_; bool use_all_reduce_; - // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ - std::vector rt_ref_cnts_; - std::vector rt_cur_ref_cnts_; - details::GarbageCollectorList gcs_; + // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and + // then keeps unchanged + // Before each iteration, runtime_ref_cnts_ is reset to global_ref_cnts_ + std::vector global_ref_cnts_; + std::vector runtime_ref_cnts_; + 
details::GarbageCollectorMap gcs_; }; +std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size) { + for (size_t i = 0; i < places_.size(); ++i) { + auto &place = places_[i]; + if (gcs_.count(place) > 0) { + continue; + } +#ifdef PADDLE_WITH_CUDA + GarbageCollector *gc = nullptr; + if (platform::is_gpu_place(place)) { + if (IsFastEagerDeletionModeEnabled()) { + gc = new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size); + } else { + gc = new StreamGarbageCollector( + boost::get(place), max_memory_size); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + gc = new CPUGarbageCollector( + boost::get(place), max_memory_size); + VLOG(10) << "Created GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA + } +#endif + + if (gc) { + gcs_[place] = std::unique_ptr>(gc); + } + } + + if (gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &global_ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kRuntimeReferenceCount, + &runtime_ref_cnts_); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &gcs_); + } + + return graph; +} + std::vector &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } @@ -153,54 +225,8 @@ ParallelExecutor::ParallelExecutor( auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - size_t place_num = member_->places_.size(); - for (size_t i = 0; i < place_num; ++i) { - auto &place = member_->places_[i]; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place)) { - if (IsFastEagerDeletionModeEnabled()) { - member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector( - boost::get(place), max_memory_size)); - } else { - member_->gcs_.emplace_back(new StreamGarbageCollector( - boost::get(place), max_memory_size)); - } - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; - } else if (platform::is_cpu_place(place)) { -#endif - member_->gcs_.emplace_back(new CPUGarbageCollector( - boost::get(place), max_memory_size)); - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#ifdef PADDLE_WITH_CUDA - } -#endif - } - } - - if (!member_->gcs_.empty()) { - std::vector last_live_ops_of_vars; - - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, - &(member_->rt_ref_cnts_)); - ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = ref_cnt_pass->Apply(std::move(graph)); - VLOG(10) << "ReferenceCountPass Applied"; - - auto eager_deletion_pass = - ir::PassRegistry::Instance().Get("eager_deletion_pass"); - eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, - &(member_->rt_cur_ref_cnts_)); - 
eager_deletion_pass->SetNotOwned(details::kGarbageCollector, - &(member_->gcs_)); - eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = eager_deletion_pass->Apply(std::move(graph)); - VLOG(10) << "EagerDeletionPass Applied"; - - graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); + graph = member_->PrepareGCAndRefCnts(std::move(graph), + static_cast(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. @@ -316,15 +342,8 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); - if (!member_->gcs_.empty()) { - member_->ResetRuntimeReferenceCount(); - size_t n = member_->rt_ref_cnts_.size(); - for (size_t i = 0; i < n; ++i) { - for (auto &fetch_name : fetch_tensors) { - member_->rt_cur_ref_cnts_[i].erase(fetch_name); - } - member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); - } + if (member_->HasGarbageCollectors()) { + member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name); } auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index da7cad82d8..06920a47ee 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -74,9 +74,7 @@ class WhileOp : public framework::OperatorBase { bool is_test = Attr("is_test"); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { @@ -144,9 +142,7 @@ class WhileGradOp : public framework::OperatorBase { auto *program = block->Program(); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = @@ -369,7 +365,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. 
while_grad->SetAttr("original_output_grad", output_grads_list); - /* The followi_ng codes are used in eager deletion mode */ + /* The following codes are used in eager deletion mode */ std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { std::unordered_set fwd_skip_vars; From 644baa2e45b64f5a52e237ca1981cb30a5043e0c Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 08/14] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f119..fd2bcb8848 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( From 8095fb5e686d3e32f1838dfe7fbf4d0108ef1f25 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 09/14] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f119..e51b1f1f73 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( @@ -122,7 +122,7 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( } } - if (gcs_.empty()) { + if (!gcs_.empty()) { std::vector last_live_ops_of_vars; auto ref_cnt_pass = From eb8252466b11bdbea7abca6fd4cc5816f1c30830 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 09:15:23 +0000 Subject: [PATCH 10/14] polish code add unittest model containing while_op remove unnecessary codes test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/details/CMakeLists.txt | 5 +- .../details/eager_deletion_op_handle.cc | 48 +++--- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 18 +- .../fluid/framework/details/op_graph_view.cc | 1 + .../framework/details/reference_count_pass.cc | 156 +++++++++++------- .../details/reference_count_pass_helper.cc | 21 +++ .../details/reference_count_pass_helper.h | 4 +- .../scope_buffered_ssa_graph_executor.cc | 21 +-- .../scope_buffered_ssa_graph_executor.h | 6 - paddle/fluid/framework/executor.cc | 56 ++++--- paddle/fluid/framework/garbage_collector.cc | 89 ++++++++++ paddle/fluid/framework/garbage_collector.h | 153 ++++++----------- paddle/fluid/framework/parallel_executor.cc | 28 ++-- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/tensor.h | 4 + .../unittests/test_eager_deletion_gru_net.py | 49 ++++++ .../unittests/test_eager_deletion_lstm_net.py | 111 +++++++++++++ 19 files changed, 516 insertions(+), 268 deletions(-) create mode 100644 
paddle/fluid/framework/details/reference_count_pass_helper.cc create mode 100644 paddle/fluid/framework/garbage_collector.cc create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c701a2ad63..f2361c5cea 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) +cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory) + cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) @@ -164,7 +166,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8049f5d3f7..a6c8ef408a 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,9 +33,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 
54715fed8d..3b27415e43 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -26,8 +26,8 @@ namespace details {
 EagerDeletionOpHandle::EagerDeletionOpHandle(
     ir::Node *node, const Scope *scope, const platform::Place &place,
-    const std::unordered_set<std::string> &var_names,
-    GarbageCollector<Tensor> *gc, AtomicReferenceCountMap *ref_cnts)
+    const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
+    AtomicReferenceCountMap *ref_cnts)
     : OpHandleBase(node),
       scope_(scope),
       var_names_(var_names),
@@ -35,9 +35,9 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
       ref_cnts_(ref_cnts) {
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place)) {
-    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
+    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
         platform::DeviceContextPool::Instance().Get(place));
-    if (dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_)) {
+    if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(
           boost::get<platform::CUDAPlace>(place).device);
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
@@ -61,10 +61,11 @@ std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
 
 void EagerDeletionOpHandle::RunImpl() {
   auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  std::vector<Tensor *> tensors;
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
   for (auto &name : var_names_) {
     auto it = ref_cnts_->find(name);
-    if (it == ref_cnts_->end()) {
+    // Var not found, or reference count has not decreased to 0
+    if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
       continue;
     }
@@ -73,43 +74,44 @@ void EagerDeletionOpHandle::RunImpl() {
       continue;
     }
 
+    VLOG(2) << "Erase variable " << name;
+
     if (var->IsType<LoDTensor>()) {
-      if (it->second.fetch_sub(1) == 1) {
-        tensors.emplace_back(var->GetMutable<LoDTensor>());
-      }
+      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
     } else if (var->IsType<SelectedRows>()) {
-      if (it->second.fetch_sub(1) == 1) {
-        tensors.emplace_back(var->GetMutable<SelectedRows>()->mutable_value());
-      }
+      garbages.emplace_back(
+          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
     } else if (var->IsType<LoDTensorArray>()) {
-      if (it->second.fetch_sub(1) == 1) {
-        auto *tensor_arr = var->GetMutable<LoDTensorArray>();
-        for (auto &t : *tensor_arr) {
-          tensors.emplace_back(&t);
-        }
+      auto *tensor_arr = var->GetMutable<LoDTensorArray>();
+      for (auto &t : *tensor_arr) {
+        garbages.emplace_back(t.MoveMemory());
       }
+    } else {
+      PADDLE_THROW("Type %s of %s is not supported for eager deletion",
+                   var->Type().name(), name);
     }
   }
 
-  if (!tensors.empty()) {
-    ClearTensors(tensors);
+  if (!garbages.empty()) {
+    ClearGarbages(&garbages);
   }
 }
 
-void EagerDeletionOpHandle::ClearTensors(const std::vector<Tensor *> &tensors) {
+void EagerDeletionOpHandle::ClearGarbages(
+    std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
 #ifdef PADDLE_WITH_CUDA
   if (event_) {
     auto compute_stream = dev_ctx_->stream();
     auto callback_stream =
-        static_cast<StreamGarbageCollector<Tensor> *>(gc_)->stream();
+        reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
     auto callback_func = [=]() {
       PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
       PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
     };
-    gc_->Add(tensors, callback_func);
+    gc_->Add(std::move(*garbages), callback_func);
   } else {
 #endif
-    gc_->Add(tensors);
+    gc_->Add(std::move(*garbages));
 #ifdef PADDLE_WITH_CUDA
   }
 #endif
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
index d8de59cc4d..64867afad5 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -14,8 +14,8 @@
 #pragma once
 
+#include <deque>
 #include <memory>
-#include <vector>
 #include
"paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" @@ -30,7 +30,7 @@ class EagerDeletionOpHandle : public OpHandleBase { EagerDeletionOpHandle(ir::Node *node, const Scope *scope, const platform::Place &place, const std::unordered_set &var_names, - GarbageCollector *gc, + GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts); ~EagerDeletionOpHandle(); @@ -41,11 +41,11 @@ class EagerDeletionOpHandle : public OpHandleBase { void RunImpl() override; private: - void ClearTensors(const std::vector &tensors); + void ClearGarbages(std::deque> *garbages); const Scope *scope_; std::unordered_set var_names_; - GarbageCollector *gc_; // not own + GarbageCollector *gc_; // not own AtomicReferenceCountMap *ref_cnts_; // not own #ifdef PADDLE_WITH_CUDA platform::CUDADeviceContext *dev_ctx_{nullptr}; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 85991c71e6..4e42d0b497 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -28,17 +28,21 @@ namespace details { std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - const auto &vars = graph->Get(kGraphVars); - auto &ref_cnts = Get>(kRuntimeReferenceCount); + PADDLE_ENFORCE(ref_cnts.empty(), + "kRuntimeReferenceCount should be initialized here!"); + + const auto &vars = graph->Get(kGraphVars); + ref_cnts.resize(vars.size()); + const auto &last_live_ops = Get>(kLastLiveOpsOfVars); - auto &gcs = Get(kGarbageCollector); + const auto &gcs = Get(kGarbageCollector); const auto &places = Get>(kAllPlaces); - ref_cnts = std::vector(vars.size()); - + // a reverse map of last_live_ops + // i.e., last op --> variable names which can be deleted. 
std::unordered_map> op_vars_map; @@ -58,8 +62,8 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( auto *eager_deletion_node = graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), - std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(), + eager_deletion_node, op->GetScope(), op->GetPlace(), var_names, + gcs.at(places[op->GetScopeIdx()]).get(), &(ref_cnts[op->GetScopeIdx()])); auto it = std::find_if( diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index b6b5ad42c4..d3865c2c29 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -42,6 +42,7 @@ void OpGraphView::Build(const std::vector &ops) { std::unordered_set OpGraphView::AllOps() const { std::unordered_set ret; + ret.reserve(preceding_ops_.size()); for (auto &pair : preceding_ops_) { ret.insert(pair.first); } diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f2c9dfb524..13a042d8e6 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,15 +29,17 @@ namespace paddle { namespace framework { namespace details { -class OpRelationDetector { - public: +// A functor to shrink/remove operators who depend on other operators in a set +class ShrinkDepsOpFunctor { + private: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; - explicit OpRelationDetector(const std::vector &all_ops) + public: + explicit ShrinkDepsOpFunctor(const std::vector &all_ops) : graph_(all_ops) {} template - OpSet MaxNoDepOps(const OpSet &op_set) const { + OpSet operator()(const OpSet &op_set) const { using KeyType = typename OpSet::key_type; static_assert( std::is_base_of(ops[i])); + ret.emplace(static_cast(ops[i])); } } return ret; @@ -59,7 +61,7 @@ class OpRelationDetector { private: std::vector> GetRelations( - const std::vector ops) const { + const std::vector &ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); @@ -112,6 +114,10 @@ class OpRelationDetector { const OpGraphView graph_; }; +/** + * Find the nearest downstream computation op handle. If the op is a + * computation op, just return itself. + */ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -134,33 +140,87 @@ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( return nullptr; } +static std::unordered_set +ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, + const ShrinkDepsOpFunctor &shrink_func, + bool *ok) { + // stage one. Get last op for variable. + std::unordered_set candidates; + { + if (var->PendingOps().empty() && var->GeneratedOp()) { + // No operator depends on this variable. So the last operator is the op + // who generates this variable. + candidates.emplace(var->GeneratedOp()); + } else { + candidates = var->PendingOps(); + } + + // No pending ops or generated op is nullptr + if (candidates.empty()) { + *ok = false; + return {}; + } + } + + // stage two. Try to cast them to computation op. + // return (*ok=false) when failed. 
+ // + // The reason why we cannot make any types of op handle to be the last lived + // op is: + // some op handle may operate on many DeviceContext, however, our garbage + // collector can only wait one DeviceContext for now. So currently, we wait + // the nearest compute op. + std::unordered_set computation_op; + { + for (auto *op : candidates) { + auto *compute_op = + FindNextComputationOpHandleOrReturnItself(op, scope_idx); + if (compute_op == nullptr) { + *ok = false; + return {}; + } + computation_op.emplace(compute_op); + } + } + + // stage three. Try to shrink computation op if they depend on each other. + // Get the smallest set of the most ops. + *ok = true; + return shrink_func(computation_op); +} + +static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kGlobalReferenceCount); auto &last_live_ops_of_vars = Get>(kLastLiveOpsOfVars); - last_live_ops_of_vars = std::vector(vars.size()); - ref_cnts = std::vector(vars.size()); + PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(), + "Last Live Ops and Reference Counts of vars should be " + "initialized at here."); - OpRelationDetector detector(ir::FilterByNodeWrapper(*graph)); + const auto &vars = graph->Get(kGraphVars); - for (size_t i = 0; i < vars.size(); ++i) { - for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) { - continue; - } + last_live_ops_of_vars.resize(vars.size()); + ref_cnts.resize(vars.size()); - const std::string &var_name = name_var_pair.first; - auto *last_ver_var = name_var_pair.second.back(); + ShrinkDepsOpFunctor shrink_func( + ir::FilterByNodeWrapper(*graph)); - VarDesc *var_desc = nullptr; - std::find_if(name_var_pair.second.rbegin(), name_var_pair.second.rend(), - [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + // Whether this variable can be reused or deleted? If not, we do not + // compute reference counts and dependencies. 
+ VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second); if (var_desc == nullptr || var_desc->Persistable()) { continue; @@ -170,50 +230,20 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( if (var_type != proto::VarType::LOD_TENSOR && var_type != proto::VarType::SELECTED_ROWS && var_type != proto::VarType::LOD_TENSOR_ARRAY) { + // Var type cannot be deleted continue; } - std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) -> bool { - auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); - if (compute_op) { - last_live_op.insert(compute_op); - return true; - } else { - return false; - } - }; - - bool can_delete = false; - auto &pending_ops = last_ver_var->PendingOps(); - if (pending_ops.empty()) { - auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op && add_last_live_op(generated_op)) { - can_delete = true; - } - } else { - can_delete = true; - for (auto *pending_op : pending_ops) { - if (!add_last_live_op(pending_op)) { - can_delete = false; - break; - } - } - } - - if (can_delete) { - size_t original_size = last_live_op.size(); - last_live_op = detector.MaxNoDepOps(last_live_op); - if (last_live_op.size() != original_size) { - VLOG(10) << "Shrink last living op number of " << var_name << " from " - << original_size << " to " << last_live_op.size(); - } - - PADDLE_ENFORCE(!last_live_op.empty(), - "Last living ops of %s cannot be empty", var_name); + bool ok; + auto result = ExtractComputationOpFromLastLivedVar( + name_var_pair.second.back(), i, shrink_func, &ok); - ref_cnts[i].emplace(var_name, last_live_op.size()); - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (ok) { + auto &var_name = name_var_pair.first; + PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", + var_name); + ref_cnts[i].emplace(var_name, result.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(result)); } } } diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc new file mode 100644 index 0000000000..89bd08c2d0 --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +namespace details {} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index eb534f9701..1c083dbf00 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -18,10 +18,10 @@ #include #include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/tensor.h" namespace paddle { namespace framework { @@ -35,7 +35,7 @@ using AtomicReferenceCountMap = std::unordered_map>; using GarbageCollectorMap = - std::map>>; + std::map>; const char kGlobalReferenceCount[] = "global_reference_count"; const char kRuntimeReferenceCount[] = "runtime_reference_count"; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index b8775fc329..57f6fc66c5 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -30,20 +30,7 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), var_infos_(std::move(var_infos)), - places_(std::move(places)) { - if (Graph().Has(details::kGarbageCollector)) { - gc_ = &(Graph().Get(details::kGarbageCollector)); - } -} - -void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { - if (gc_) { - for (auto &gc_pair : *gc_) { - gc_pair.second->Wait(); - gc_pair.second->Reset(); - } - } -} + places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { @@ -83,19 +70,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (auto &p : places_) { + for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - WaitAllGarbageCollectors(); for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } - } else { - WaitAllGarbageCollectors(); } - if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 6086a219e0..5e87e0bf50 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -21,11 +21,9 @@ #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" - namespace paddle { namespace framework { namespace details { @@ -50,8 +48,6 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; private: - void WaitAllGarbageCollectors(); - size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; @@ -59,8 +55,6 @@ 
class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope*> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
-
-  GarbageCollectorMap* gc_{nullptr};
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 04425a5983..767bbb524f 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/executor.h"
+#include <deque>
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
@@ -83,31 +84,37 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 }
 
 static void DeleteUnusedTensors(
-    const Scope& scope, const OperatorBase* op, GarbageCollector<Tensor>* gc,
+    const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
     std::unordered_map<std::string, size_t>* ref_cnts) {
-  std::unordered_set<Tensor*> erase_tensors;
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
 
   auto handler = [&](const VariableNameMap& name_map) {
     for (auto& name_pair : name_map) {
       for (auto& name : name_pair.second) {
         auto it = ref_cnts->find(name);
         if (it == ref_cnts->end()) continue;
-        if (--(it->second) == 0) {
-          auto* var = scope.FindVar(name);
-          if (var != nullptr) {
-            VLOG(2) << "Erase tensor \'" << name << "\'";
-            if (var->IsType<LoDTensor>()) {
-              erase_tensors.insert(var->GetMutable<LoDTensor>());
-            } else if (var->IsType<SelectedRows>()) {
-              erase_tensors.insert(
-                  var->GetMutable<SelectedRows>()->mutable_value());
-            } else if (var->IsType<LoDTensorArray>()) {
-              auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
-              for (auto& t : *lod_tensor_arr) {
-                erase_tensors.insert(&t);
-              }
-            }
+        if (--(it->second) != 0) {
+          continue;
+        }
+        auto* var = scope.FindVar(name);
+        if (var == nullptr) {
+          continue;
+        }
+
+        VLOG(2) << "Erase variable " << name;
+        if (var->IsType<LoDTensor>()) {
+          garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
+        } else if (var->IsType<SelectedRows>()) {
+          garbages.emplace_back(
+              var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
+        } else if (var->IsType<LoDTensorArray>()) {
+          auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
+          for (auto& t : *lod_tensor_arr) {
+            garbages.emplace_back(t.MoveMemory());
           }
+        } else {
+          PADDLE_THROW("Type %s of %s is not supported for eager deletion",
+                       var->Type().name(), name);
         }
       }
     }
@@ -116,8 +123,8 @@ static void DeleteUnusedTensors(
   handler(op->Inputs());
   handler(op->Outputs());
 
-  if (!erase_tensors.empty()) {
-    gc->Add(erase_tensors);
+  if (!garbages.empty()) {
+    gc->Add(std::move(garbages));
   }
 }
 
@@ -411,22 +418,22 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
-  std::unique_ptr<GarbageCollector<Tensor>> gc;
+  std::unique_ptr<GarbageCollector> gc;
   if (max_memory_size >= 0) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
       if (IsFastEagerDeletionModeEnabled()) {
-        gc.reset(new UnsafeFastGPUGarbageCollector<Tensor>(
+        gc.reset(new UnsafeFastGPUGarbageCollector(
             boost::get<platform::CUDAPlace>(place_), max_memory_size));
       } else {
-        gc.reset(new DefaultStreamGarbageCollector<Tensor>(
+        gc.reset(new DefaultStreamGarbageCollector(
            boost::get<platform::CUDAPlace>(place_), max_memory_size));
       }
     } else if (platform::is_cpu_place(place_)) {
 #endif
-      gc.reset(new CPUGarbageCollector<Tensor>(
-          boost::get<platform::CPUPlace>(place_), max_memory_size));
+      gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place_),
+                                       max_memory_size));
 #ifdef PADDLE_WITH_CUDA
     }
 #endif
@@ -442,7 +449,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  if (gc) gc->Wait();
 
   if (local_scope != scope) {
     scope->DeleteScope(local_scope);
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
new file mode 100644
index 0000000000..54d9d0dc01
--- /dev/null
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+#include "paddle/fluid/framework/garbage_collector.h"
+
+namespace paddle {
+namespace framework {
+
+GarbageCollector::GarbageCollector(const platform::Place &place,
+                                   size_t max_memory_size)
+    : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
+  garbages_.reset(new GarbageQueue());
+  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
+}
+
+CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
+                                         size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
+  callback();
+}
+
+#ifdef PADDLE_WITH_CUDA
+UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
+    const platform::CUDAPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void UnsafeFastGPUGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  callback();
+}
+
+DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
+    const platform::CUDAPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void DefaultStreamGarbageCollector::Wait() const {
+  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+      ->WaitStreamCallback();
+}
+
+void DefaultStreamGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+      ->AddStreamCallback(callback);
+}
+
+StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
+                                               size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {
+  platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  callback_manager_.reset(new platform::StreamCallbackManager(stream_));
+}
+
+StreamGarbageCollector::~StreamGarbageCollector() {
+  auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
+  platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+}
+
+cudaStream_t StreamGarbageCollector::stream() const { return stream_; }
+
+void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
+
+void StreamGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  callback_manager_->AddCallback(callback);
+}
+#endif
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 1382e0d461..2768671029 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ 
b/paddle/fluid/framework/garbage_collector.h @@ -14,160 +14,83 @@ #pragma once -#include #include #include #include #include // NOLINT -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { -// T should have memory_size() and clear() method -template class GarbageCollector { public: - GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { - garbages_.reset(new std::deque()); - dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); - } + using GarbageQueue = std::deque>; - virtual ~GarbageCollector() {} + GarbageCollector(const platform::Place &place, size_t max_memory_size); - size_t NumOfGarbages() const { - std::lock_guard guard(mutex_); - return garbages_->size(); - } + virtual ~GarbageCollector() = default; - void Reset() { - std::lock_guard guard(mutex_); - garbages_.reset(new std::deque()); - cur_memory_size_ = 0; - } + virtual void Wait() const {} template - void Add(const Container &objs) { - Add(objs, []() {}); - } + void Add(Container &&objs); template - void Add(const Container &objs, Callback &&callback) { - std::deque *clear_deque = nullptr; - { - std::lock_guard guard(mutex_); - for (auto *obj : objs) { - garbages_->push_back(obj); - cur_memory_size_ += obj->memory_size(); - } - if (cur_memory_size_ >= max_memory_size_) { - cur_memory_size_ = 0; - clear_deque = garbages_.release(); - garbages_.reset(new std::deque()); - } - } - - if (clear_deque != nullptr) { - callback(); - ClearCallback([clear_deque]() { - for (auto *obj : *clear_deque) obj->clear(); - delete clear_deque; - }); - } - } - - virtual void Wait() const {} + void Add(Container &&objs, Callback &&callback); protected: virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::unique_ptr> garbages_; + std::unique_ptr garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; - size_t cur_memory_size_ = 0; + size_t cur_memory_size_{0}; }; -template -class CPUGarbageCollector : public GarbageCollector { +class CPUGarbageCollector : public GarbageCollector { public: - CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; #ifdef PADDLE_WITH_CUDA -template -class UnsafeFastGPUGarbageCollector : public GarbageCollector { +class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; -template -class DefaultStreamGarbageCollector : public GarbageCollector { +class DefaultStreamGarbageCollector : public GarbageCollector { public: DefaultStreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); - cudaStream_t stream() const { - return static_cast(this->dev_ctx_) - ->stream(); - } - - void Wait() const override { - 
-    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->WaitStreamCallback();
-  }
+  void Wait() const override;
 
  protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->AddStreamCallback(callback);
-  }
+  void ClearCallback(const std::function<void()> &callback) override;
 };
 
-template <typename T>
-class StreamGarbageCollector : public GarbageCollector<T> {
+class StreamGarbageCollector : public GarbageCollector {
  public:
   StreamGarbageCollector(const platform::CUDAPlace &place,
-                         size_t max_memory_size)
-      : GarbageCollector<T>(place, max_memory_size) {
-    platform::CUDADeviceGuard guard(place.device);
-    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
-    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
-  }
+                         size_t max_memory_size);
 
-  ~StreamGarbageCollector() {
-    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
-    platform::CUDADeviceGuard guard(place.device);
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
-  }
+  ~StreamGarbageCollector();
 
-  void Wait() const override { callback_manager_->Wait(); }
+  void Wait() const override;
 
-  cudaStream_t stream() const { return stream_; }
+  cudaStream_t stream() const;
 
  protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    callback_manager_->AddCallback(callback);
-  }
+  void ClearCallback(const std::function<void()> &callback) override;
 
  private:
   cudaStream_t stream_;
@@ -175,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector<T> {
 };
 #endif
 
+template <typename Container>
+void GarbageCollector::Add(Container &&objs) {
+  Add(std::forward<Container>(objs), []() {});
+}
+
+template <typename Container, typename Callback>
+void GarbageCollector::Add(Container &&objs, Callback &&callback) {
+  GarbageQueue *garbage_queue = nullptr;
+  {
+    std::lock_guard<std::mutex> guard(mutex_);
+    for (auto &obj : objs) {
+      if (!obj) continue;
+      cur_memory_size_ += obj->size();
+      garbages_->push_back(std::move(obj));
+    }
+    if (cur_memory_size_ >= max_memory_size_) {
+      cur_memory_size_ = 0;
+      garbage_queue = garbages_.release();
+      garbages_.reset(new GarbageQueue());
+    }
+  }
+
+  if (garbage_queue) {
+    callback();
+    ClearCallback([garbage_queue]() { delete garbage_queue; });
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
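With Add() now an out-of-line template over any container of
shared_ptr<memory::Allocation> holders, a caller batches everything it wants
freed into one call. A hedged sketch of the calling convention:
GarbageCollector and GarbageQueue come from the header above, while
ReleaseBatch itself is purely illustrative:

    #include <utility>

    #include "paddle/fluid/framework/garbage_collector.h"

    namespace fw = paddle::framework;

    // Illustrative helper: drain a batch of allocation holders into the GC.
    // Each non-null holder is moved onto the internal GarbageQueue and charged
    // obj->size() bytes; nothing is destroyed until max_memory_size_ is hit.
    void ReleaseBatch(fw::GarbageCollector *gc,
                      fw::GarbageCollector::GarbageQueue *holders) {
      gc->Add(std::move(*holders));
      holders->clear();
    }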
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index e51b1f1f73..7458b69af8 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -97,29 +97,31 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
     if (gcs_.count(place) > 0) {
       continue;
     }
-    GarbageCollector<Tensor> *gc = nullptr;
+    std::unique_ptr<GarbageCollector> gc;
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place)) {
       if (IsFastEagerDeletionModeEnabled()) {
-        gc = new UnsafeFastGPUGarbageCollector<Tensor>(
-            boost::get<platform::CUDAPlace>(place), max_memory_size);
+        gc.reset(new UnsafeFastGPUGarbageCollector(
+            boost::get<platform::CUDAPlace>(place), max_memory_size));
       } else {
-        gc = new StreamGarbageCollector<Tensor>(
-            boost::get<platform::CUDAPlace>(place), max_memory_size);
+        gc.reset(new StreamGarbageCollector(
+            boost::get<platform::CUDAPlace>(place), max_memory_size));
       }
       VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
-    } else if (platform::is_cpu_place(place)) {
+    } else {
 #endif
-      gc = new CPUGarbageCollector<Tensor>(
-          boost::get<platform::CPUPlace>(place), max_memory_size);
-      VLOG(10) << "Created GarbageCollector at " << place;
+      if (platform::is_cpu_place(place)) {
+        gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place),
+                                         max_memory_size));
+        VLOG(10) << "Created GarbageCollector at " << place;
+      } else {
+        PADDLE_THROW("Unsupported place for garbage collection");
+      }
 #ifdef PADDLE_WITH_CUDA
     }
 #endif
 
-    if (gc) {
-      gcs_[place] = std::unique_ptr<GarbageCollector<Tensor>>(gc);
-    }
+    gcs_.emplace(place, std::move(gc));
   }
 
   if (!gcs_.empty()) {
@@ -144,8 +146,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
     eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
     graph = eager_deletion_pass->Apply(std::move(graph));
     VLOG(10) << "EagerDeletionPass Applied";
-
-    graph->SetNotOwned(details::kGarbageCollector, &gcs_);
   }
 
   return graph;
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index cb3b6cdc3e..6fa5e99f9f 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -38,7 +38,7 @@ DEFINE_double(
     "Memory size threshold (GB) when the garbage collector clear tensors."
     "Disabled when this value is less than 0");
 
-DEFINE_bool(fast_eager_deletion_mode, true,
+DEFINE_bool(fast_eager_deletion_mode, false,
             "Fast eager deletion mode. If enabled, memory would release "
             "immediately without waiting GPU kernel ends.");
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 71e8badd4b..9f7027f5ae 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -158,6 +158,10 @@ class Tensor {
   const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
 
   size_t offset() const { return offset_; }
 
+  std::shared_ptr<memory::Allocation> MoveMemory() {
+    return std::move(holder_);
+  }
+
  private:
   /*! holds the memory block if allocated. */
   std::shared_ptr<memory::Allocation> holder_;
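Tensor::MoveMemory() transfers holder_ out of the tensor, so the eager
deletion machinery can keep the allocation alive on the GC queue while the
variable itself is left empty. A toy illustration of the same
move-out-of-a-member pattern, independent of Paddle:

    #include <cassert>
    #include <memory>
    #include <utility>

    struct Buffer {
      std::shared_ptr<int> holder = std::make_shared<int>(42);
      // Analogous to Tensor::MoveMemory(): transfer ownership to the caller.
      std::shared_ptr<int> MoveMemory() { return std::move(holder); }
    };

    int main() {
      Buffer b;
      auto stolen = b.MoveMemory();
      assert(b.holder == nullptr);  // the buffer no longer owns the data
      assert(*stolen == 42);        // the caller now keeps it alive
      return 0;
    }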
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
new file mode 100644
index 0000000000..1ec174544c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_eager_deletion_lstm_net import TestBase
+import paddle.fluid as fluid
+
+
+def gru_net(data,
+            label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            emb_lr=400.0):
+    emb = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
+    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
+    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
+    gru_max_tanh = fluid.layers.tanh(gru_max)
+    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    return avg_cost
+
+
+class GRUTest(TestBase):
+    def setUp(self):
+        self.net = gru_net
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
new file mode 100644
index 0000000000..431765bff2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
+os.environ['CPU_NUM'] = '2'
+
+import six
+import unittest
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
+    if use_cuda and not core.is_compiled_with_cuda():
+        print('Skip use_cuda=True because Paddle is not compiled with cuda')
+        return
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    train_reader = paddle.batch(
+        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    cost = network(data, label, len(word_dict))
+    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
+    optimizer.minimize(cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+    reader = feeder.decorate_reader(
+        train_reader, multi_devices=use_parallel_executor)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if use_parallel_executor:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=cost.name)
+        fetch_list = [cost.name]
+    else:
+        train_exe = exe
+        fetch_list = [cost]
+
+    for pass_id in six.moves.xrange(pass_num):
+        batch_id = 0
+        for data in reader():
+            train_exe.run(feed=data,
+                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
+            batch_id += 1
+            if batch_id > 16:
+                break
+
+
+def lstm_net(data,
+             label,
+             dict_dim,
+             emb_dim=128,
+             hid_dim=128,
+             hid_dim2=96,
+             class_dim=2,
+             emb_lr=30.0):
+    emb = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
+    lstm_h, c = fluid.layers.dynamic_lstm(
+        input=fc0, size=hid_dim * 4, is_reverse=False)
+    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
+    lstm_max_tanh = fluid.layers.tanh(lstm_max)
+    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    return avg_cost
+
+
+class TestBase(unittest.TestCase):
+    def setUp(self):
+        self.net = lstm_net
+
+    def test_network(self):
+        for use_cuda in [True, False]:
+            for use_parallel_executor in [False, True]:
+                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
+                      format(self.net.__name__, use_cuda,
+                             use_parallel_executor))
+                with fluid.program_guard(fluid.Program(), fluid.Program()):
+                    with fluid.scope_guard(core.Scope()):
+                        train(self.net, use_cuda, use_parallel_executor)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 2c6159a151d573ca697e2dfd591720cc854b4b9b Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Fri, 7 Dec 2018 13:59:36 +0000
Subject: [PATCH 11/14] fix unittest

fix cmake
test=develop
---
 paddle/fluid/framework/CMakeLists.txt         |  4 +-
 .../test_eager_deletion_dynamic_rnn_base.py   | 86 +++++++++++++++++++
 .../unittests/test_eager_deletion_gru_net.py  |  2 +-
 .../unittests/test_eager_deletion_lstm_net.py | 67 +--------------
 4 files changed, 92 insertions(+), 67 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index f2361c5cea..b236eef3ce 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -171,9 +171,9 @@ if(WITH_DISTRIBUTE)
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   if(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper garbage_collector)
   else(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper garbage_collector)
   endif(NOT WIN32)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
new file mode 100644
index 0000000000..e91cfe0b45
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
+os.environ['CPU_NUM'] = '2'
+
+import six
+import unittest
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
+    if use_cuda and not core.is_compiled_with_cuda():
+        print('Skip use_cuda=True because Paddle is not compiled with cuda')
+        return
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    train_reader = paddle.batch(
+        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    cost = network(data, label, len(word_dict))
+    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
+    optimizer.minimize(cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+    reader = feeder.decorate_reader(
+        train_reader, multi_devices=use_parallel_executor)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if use_parallel_executor:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=cost.name)
+        fetch_list = [cost.name]
+    else:
+        train_exe = exe
+        fetch_list = [cost]
+
+    for pass_id in six.moves.xrange(pass_num):
+        batch_id = 0
+        for data in reader():
+            train_exe.run(feed=data,
+                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
+            batch_id += 1
+            if batch_id > 16:
+                break
+
+
+class TestBase(unittest.TestCase):
+    def setUp(self):
+        self.net = None
+
+    def test_network(self):
+        if self.net is None:
+            return
+
+        for use_cuda in [True, False]:
+            for use_parallel_executor in [False, True]:
+                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
+                      format(self.net.__name__, use_cuda,
+                             use_parallel_executor))
+                with fluid.program_guard(fluid.Program(), fluid.Program()):
+                    with fluid.scope_guard(core.Scope()):
+                        train(self.net, use_cuda, use_parallel_executor)
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
index 1ec174544c..5ed3d9fdf3 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import unittest
-from test_eager_deletion_lstm_net import TestBase
+from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
index 431765bff2..8462c06aa5 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
@@ -12,60 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
-os.environ['CPU_NUM'] = '2'
-
-import six
-import unittest
-
-import paddle
-import paddle.fluid.core as core
+from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid
-
-
-def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
-    if use_cuda and not core.is_compiled_with_cuda():
-        print('Skip use_cuda=True because Paddle is not compiled with cuda')
-        return
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    train_reader = paddle.batch(
-        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
-
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-    cost = network(data, label, len(word_dict))
-    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
-    optimizer.minimize(cost)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-    reader = feeder.decorate_reader(
-        train_reader, multi_devices=use_parallel_executor)
-
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-
-    if use_parallel_executor:
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda, loss_name=cost.name)
-        fetch_list = [cost.name]
-    else:
-        train_exe = exe
-        fetch_list = [cost]
-
-    for pass_id in six.moves.xrange(pass_num):
-        batch_id = 0
-        for data in reader():
-            train_exe.run(feed=data,
-                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
-            batch_id += 1
-            if batch_id > 16:
-                break
+import unittest
 
 
 def lstm_net(data,
@@ -92,20 +41,10 @@ def lstm_net(data,
     return avg_cost
 
 
-class TestBase(unittest.TestCase):
+class LSTMTest(TestBase):
     def setUp(self):
         self.net = lstm_net
 
-    def test_network(self):
-        for use_cuda in [True, False]:
-            for use_parallel_executor in [False, True]:
-                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
-                      format(self.net.__name__, use_cuda,
-                             use_parallel_executor))
-                with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    with fluid.scope_guard(core.Scope()):
-                        train(self.net, use_cuda, use_parallel_executor)
-
 
 if __name__ == "__main__":
     unittest.main()

From 06f8aa5b97be564b878848acd216069e23081300 Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Wed, 12 Dec 2018 03:08:21 +0000
Subject: [PATCH 12/14] remove while_op support temporarily

test=develop
---
 paddle/fluid/framework/executor.cc          |  3 +-
 .../fluid/operators/controlflow/while_op.cc | 46 +------------------
 2 files changed, 3 insertions(+), 46 deletions(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 767bbb524f..7eab876015 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -419,7 +419,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   int64_t max_memory_size = GetEagerDeletionThreshold();
 
   std::unique_ptr<GarbageCollector> gc;
-  if (max_memory_size >= 0) {
+  // skip while_op and while_grad_op temporarily
+  if (max_memory_size >= 0 && !keep_kids) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
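The new keep_kids condition above is the whole while_op opt-out: runs that
must keep child scopes alive (while_op preserves its step scopes for
while_grad) get no garbage collector at all, rather than risking an early
free of a tensor the backward pass still reads. A toy restatement of that
decision, with all names hypothetical:

    #include <cstdint>
    #include <memory>

    struct ToyGC {};

    // Eager deletion is active only when a threshold is configured AND the
    // run does not need to keep child scopes alive for a later backward run.
    std::unique_ptr<ToyGC> MaybeMakeGC(int64_t max_memory_size,
                                       bool keep_kids) {
      if (max_memory_size >= 0 && !keep_kids) {
        return std::unique_ptr<ToyGC>(new ToyGC());
      }
      return nullptr;  // GC off: nothing is eagerly deleted this run
    }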
while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ - std::unordered_set bwd_skip_vars; - if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set fwd_skip_vars; - for (auto *op_desc : grad_block->AllOps()) { - auto skippable = [&](const std::string &name) { - return !grad_block->HasVar(name) && - (fwd_block->HasVarRecursive(name) || - parent_block->HasVarRecursive(name)); - }; - for (auto &in_arg_name : op_desc->InputArgumentNames()) { - if (skippable(in_arg_name)) { - fwd_skip_vars.insert(in_arg_name); - } - } - - for (auto &out_arg_name : op_desc->OutputArgumentNames()) { - if (skippable(out_arg_name)) { - fwd_skip_vars.insert(out_arg_name); - } - } - } - - if (!fwd_skip_vars.empty()) { - // FIXME(zjl): ugly const_cast here, maybe we should find a better way - // to modify forward while_op - auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr(kSkipEagerDeletionVars, - std::vector(fwd_skip_vars.begin(), - fwd_skip_vars.end())); - } - - // Find backward skip vars - auto fwd_input = Input(kX); - for (size_t i = 0; i < igs.size(); ++i) { - if (igs[i] == framework::kEmptyVarName) { - continue; - } - bwd_skip_vars.insert(igs[i]); - bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); - } - } - while_grad->SetAttr( - kSkipEagerDeletionVars, - std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); + while_grad->SetAttr(kSkipEagerDeletionVars, std::vector()); return std::unique_ptr(while_grad); } From e82772f42518f1cff790ac04aa1c73c2e5b201e9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 09:22:44 +0000 Subject: [PATCH 13/14] fix cmake conflict test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b1cfb23f3a..6d7a69c8c9 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -169,7 +169,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() From 2328bee1cc835d789b83cd4da9bef6b588bc87c5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 13 Dec 2018 06:34:09 +0000 Subject: [PATCH 14/14] fix Windows compile bug test=develop --- .../framework/details/eager_deletion_op_handle.cc | 6 +++--- paddle/fluid/framework/executor.cc | 10 ++++++---- paddle/fluid/framework/tensor.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 3b27415e43..abacb11e3b 100644 --- 
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 3b27415e43..abacb11e3b 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -77,14 +77,14 @@ void EagerDeletionOpHandle::RunImpl() {
     VLOG(2) << "Erase variable " << name;
 
     if (var->IsType<LoDTensor>()) {
-      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
+      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
     } else if (var->IsType<SelectedRows>()) {
       garbages.emplace_back(
-          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
+          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
     } else if (var->IsType<LoDTensorArray>()) {
       auto *tensor_arr = var->GetMutable<LoDTensorArray>();
       for (auto &t : *tensor_arr) {
-        garbages.emplace_back(t.MoveMemory());
+        garbages.emplace_back(t.MoveMemoryHolder());
       }
     } else {
       PADDLE_THROW("Type %s of %s is not supported eager deletion",
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 16c4552a5f..0c4bd336c5 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -106,14 +106,16 @@ static void DeleteUnusedTensors(
       VLOG(2) << "Erase variable " << name;
 
       if (var->IsType<LoDTensor>()) {
-        garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
-      } else if (var->IsType<SelectedRows>()) {
         garbages.emplace_back(
-            var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
+            var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+      } else if (var->IsType<SelectedRows>()) {
+        garbages.emplace_back(var->GetMutable<SelectedRows>()
+                                  ->mutable_value()
+                                  ->MoveMemoryHolder());
       } else if (var->IsType<LoDTensorArray>()) {
         auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
         for (auto& t : *lod_tensor_arr) {
-          garbages.emplace_back(t.MoveMemory());
+          garbages.emplace_back(t.MoveMemoryHolder());
         }
       } else {
         PADDLE_THROW("Type %s of %s is not supported eager deletion",
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 9f7027f5ae..153222506a 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -158,7 +158,7 @@ class Tensor {
   const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
 
   size_t offset() const { return offset_; }
 
-  std::shared_ptr<memory::Allocation> MoveMemory() {
+  std::shared_ptr<memory::Allocation> MoveMemoryHolder() {
     return std::move(holder_);
   }