Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize-adam

Qiao Longfei 6 years ago
commit 4035e4bab2

@ -198,6 +198,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)

@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
cc_test(reader_test SRCS reader_test.cc DEPS reader)
@ -183,6 +185,8 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
target_link_libraries(executor garbage_collector)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph build_strategy

@ -45,10 +45,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
if (WITH_GPU)
cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
endif()
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
@ -56,10 +56,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})

@ -20,11 +20,13 @@ namespace paddle {
namespace framework {
namespace details {
ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
platform::Place place)
platform::Place place,
size_t scope_idx)
: OpHandleBase(node),
op_(framework::OpRegistry::CreateOp(*node->Op())),
scope_(scope),
place_(place) {}
place_(place),
scope_idx_(scope_idx) {}
void ComputationOpHandle::RunImpl() {
WaitInputVarGenerated(place_);

@ -28,7 +28,8 @@ namespace framework {
namespace details {
struct ComputationOpHandle : public OpHandleBase {
public:
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
size_t scope_idx);
std::string Name() const override;
@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase {
void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }
size_t GetScopeIdx() const { return scope_idx_; }
protected:
void RunImpl() override;
@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase {
std::unique_ptr<OperatorBase> op_;
Scope *scope_;
platform::Place place_;
size_t scope_idx_;
bool is_lock_and_record_event_free_{false};
};
} // namespace details

@ -0,0 +1,122 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
namespace paddle {
namespace framework {
namespace details {
EagerDeletionOpHandle::EagerDeletionOpHandle(
ir::Node *node, const Scope *scope, const platform::Place &place,
const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
AtomicReferenceCountMap *ref_cnts)
: OpHandleBase(node),
scope_(scope),
var_names_(var_names),
gc_(gc),
ref_cnts_(ref_cnts) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place)) {
dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard(
boost::get<platform::CUDAPlace>(place).device);
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
PADDLE_ENFORCE_NOT_NULL(event_);
}
}
#endif
}
EagerDeletionOpHandle::~EagerDeletionOpHandle() {
#ifdef PADDLE_WITH_CUDA
if (event_) {
auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(gpu_place.device);
PADDLE_ENFORCE(cudaEventDestroy(event_));
}
#endif
}
std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
void EagerDeletionOpHandle::RunImpl() {
auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (auto &name : var_names_) {
auto it = ref_cnts_->find(name);
// Var not found, or reference count has not decreased to 0
if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
continue;
}
auto *var = exec_scope->FindVar(name);
if (var == nullptr) {
continue;
}
VLOG(2) << "Erase variable " << name;
if (var->IsType<LoDTensor>()) {
garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
garbages.emplace_back(
var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
auto *tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto &t : *tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
var->Type().name(), name);
}
}
if (!garbages.empty()) {
ClearGarbages(&garbages);
}
}
void EagerDeletionOpHandle::ClearGarbages(
std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
#ifdef PADDLE_WITH_CUDA
if (event_) {
auto compute_stream = dev_ctx_->stream();
auto callback_stream =
reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(std::move(*garbages), callback_func);
} else {
#endif
gc_->Add(std::move(*garbages));
#ifdef PADDLE_WITH_CUDA
}
#endif
}
} // namespace details
} // namespace framework
} // namespace paddle
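
A brief aside on the reference-count check in RunImpl above: std::atomic::fetch_sub returns the value held before the decrement, so fetch_sub(1) != 1 means some other op still holds a reference and the variable must be kept. A minimal standalone sketch of that idiom (illustrative only, not part of this commit):

#include <atomic>
#include <cstddef>
#include <cstdio>

int main() {
  std::atomic<std::size_t> ref_cnt{2};  // two ops still need the variable
  // First op finishes: 2 -> 1. fetch_sub returns 2, so this is not the last user.
  std::printf("last user? %d\n", ref_cnt.fetch_sub(1) == 1);  // prints 0
  // Second op finishes: 1 -> 0. fetch_sub returns 1, so the variable can be erased.
  std::printf("last user? %d\n", ref_cnt.fetch_sub(1) == 1);  // prints 1
  return 0;
}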

@ -0,0 +1,58 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <deque>
#include <string>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
namespace paddle {
namespace framework {
class Scope;
namespace details {
class EagerDeletionOpHandle : public OpHandleBase {
public:
EagerDeletionOpHandle(ir::Node *node, const Scope *scope,
const platform::Place &place,
const std::unordered_set<std::string> &var_names,
GarbageCollector *gc,
AtomicReferenceCountMap *ref_cnts);
~EagerDeletionOpHandle();
std::string Name() const override;
protected:
void RunImpl() override;
private:
void ClearGarbages(std::deque<std::shared_ptr<memory::Allocation>> *garbages);
const Scope *scope_;
std::unordered_set<std::string> var_names_;
GarbageCollector *gc_; // not own
AtomicReferenceCountMap *ref_cnts_; // not own
#ifdef PADDLE_WITH_CUDA
platform::CUDADeviceContext *dev_ctx_{nullptr};
cudaEvent_t event_{nullptr};
#endif
};
} // namespace details
} // namespace framework
} // namespace paddle

@ -0,0 +1,101 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <queue>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts =
Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount);
PADDLE_ENFORCE(ref_cnts.empty(),
"kRuntimeReferenceCount should be initialized here!");
const auto &vars = graph->Get<GraphVars>(kGraphVars);
ref_cnts.resize(vars.size());
const auto &last_live_ops =
Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
const auto &gcs = Get<GarbageCollectorMap>(kGarbageCollector);
const auto &places = Get<std::vector<platform::Place>>(kAllPlaces);
// a reverse map of last_live_ops
// i.e., last op --> variable names which can be deleted.
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
op_vars_map;
for (auto &var_ops_map : last_live_ops) {
for (auto &var_ops_pair : var_ops_map) {
const std::string &var_name = var_ops_pair.first;
for (auto *op : var_ops_pair.second) {
op_vars_map[op].insert(var_name);
}
}
}
for (auto &pair : op_vars_map) {
auto *op = pair.first;
auto &var_names = pair.second;
auto *eager_deletion_node =
graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation);
auto *eager_deletion_op = new EagerDeletionOpHandle(
eager_deletion_node, op->GetScope(), op->GetPlace(), var_names,
gcs.at(places[op->GetScopeIdx()]).get(),
&(ref_cnts[op->GetScopeIdx()]));
auto it = std::find_if(
op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) {
return dynamic_cast<DummyVarHandle *>(var) != nullptr;
});
if (it != op->Outputs().end()) {
eager_deletion_op->AddInput(*it);
} else {
auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
op->AddOutput(dep_var);
eager_deletion_op->AddInput(dep_var);
}
auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
eager_deletion_op->AddOutput(dummy_leaf);
}
VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(eager_deletion_pass,
paddle::framework::details::EagerDeletionPass)
.RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount)
.RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
.RequirePassAttr(paddle::framework::details::kAllPlaces)
.RequirePassAttr(paddle::framework::details::kGarbageCollector);
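
The four RequirePassAttr entries above mean a caller must attach the runtime reference counts, the last-live-ops map, the place list, and the per-place garbage collectors before applying the pass; that wiring is not shown in this excerpt. A hedged sketch of what it might look like, assuming the usual ir::PassRegistry / Pass::SetNotOwned / Pass::Apply API used by other Paddle passes (the helper function and variable names are hypothetical):

std::unique_ptr<ir::Graph> ApplyEagerDeletion(
    std::unique_ptr<ir::Graph> graph,
    std::vector<details::AtomicReferenceCountMap> *runtime_ref_cnts,
    std::vector<details::LastLiveOpsOfVars> *last_live_ops_of_vars,
    std::vector<platform::Place> *places, details::GarbageCollectorMap *gcs) {
  auto pass = ir::PassRegistry::Instance().Get("eager_deletion_pass");
  // The pass only borrows these attributes; the caller keeps ownership.
  pass->SetNotOwned(details::kRuntimeReferenceCount, runtime_ref_cnts);
  pass->SetNotOwned(details::kLastLiveOpsOfVars, last_live_ops_of_vars);
  pass->SetNotOwned(details::kAllPlaces, places);
  pass->SetNotOwned(details::kGarbageCollector, gcs);
  return pass->Apply(std::move(graph));
}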

@ -0,0 +1,32 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
class EagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle

@ -565,7 +565,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
int dev_id) const {
result->Get<GraphOps>(kGraphOps).emplace_back(
new ComputationOpHandle(result->CreateOpNode(node->Op()),
local_scopes_[dev_id], places_[dev_id]));
local_scopes_[dev_id], places_[dev_id], dev_id));
CreateOpHandleIOs(result, node, dev_id);
}
@ -688,8 +688,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
auto p = places_[scope_idx];
auto s = local_scopes_[scope_idx];
result->Get<GraphOps>(kGraphOps).emplace_back(
new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
result->Get<GraphOps>(kGraphOps).emplace_back(new ComputationOpHandle(
result->CreateOpNode(node->Op()), s, p, scope_idx));
CreateOpHandleIOs(result, node, scope_idx);
}
}

@ -23,6 +23,8 @@ namespace details {
OpGraphView::OpGraphView(const std::vector<OpHandleBase *> &ops) { Build(ops); }
void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
preceding_ops_.clear();
pending_ops_.clear();
for (auto &op : ops) {
preceding_ops_[op];
pending_ops_[op];
@ -40,6 +42,7 @@ void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
std::unordered_set<OpHandleBase *> OpGraphView::AllOps() const {
std::unordered_set<OpHandleBase *> ret;
ret.reserve(preceding_ops_.size());
for (auto &pair : preceding_ops_) {
ret.insert(pair.first);
}

@ -14,7 +14,7 @@
#pragma once
#include <memory>
#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <vector>
@ -34,6 +34,11 @@ class OpGraphView {
bool HasOp(OpHandleBase *op) const;
// Use a visitor to visit all pending ops of op
// Stop when callback returns false
template <typename Callback>
bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const;
private:
void Build(const std::vector<OpHandleBase *> &ops);
void EnforceHasOp(OpHandleBase *op) const;
@ -44,6 +49,28 @@ class OpGraphView {
pending_ops_;
};
template <typename Callback>
bool OpGraphView::VisitAllPendingOps(OpHandleBase *op,
Callback &&callback) const {
EnforceHasOp(op);
std::unordered_set<OpHandleBase *> visited;
std::queue<OpHandleBase *> q;
q.push(op);
do {
op = q.front();
q.pop();
for (auto &pending_op : pending_ops_.at(op)) {
if (visited.count(pending_op) == 0) {
visited.insert(pending_op);
if (!callback(pending_op)) {
return false;
}
q.push(pending_op);  // continue the BFS from this newly visited op
}
}
} while (!q.empty());
return true;
}
} // namespace details
} // namespace framework
} // namespace paddle
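
For context, a hedged usage sketch of the new VisitAllPendingOps visitor (illustrative only, not from this commit): collecting the ops downstream of a given op; returning false from the callback stops the traversal early.

std::unordered_set<OpHandleBase *> DownstreamOps(const OpGraphView &view,
                                                 OpHandleBase *start) {
  std::unordered_set<OpHandleBase *> result;
  view.VisitAllPendingOps(start, [&result](OpHandleBase *op) {
    result.insert(op);  // record each pending op exactly once
    return true;        // keep traversing; false would stop early
  });
  return result;
}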

@ -1,138 +0,0 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace framework {
namespace details {
using ReferenceCountMap = std::unordered_map<std::string, int>;
using AtomicReferenceCountMap =
std::unordered_map<std::string, std::atomic<int>>;
using DeviceReferenceCountMap =
std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
using AtomicDeviceReferenceCountMap =
std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
using DeviceGarbageCollectorMap =
std::unordered_map<int,
std::unique_ptr<GarbageCollector<framework::Tensor>>>;
class ReferenceCountOpHandle : public OpHandleBase {
public:
ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
const platform::CUDAPlace &place,
const std::vector<std::string> &var_names,
GarbageCollector<Tensor> *gc,
AtomicReferenceCountMap *ref_cnts)
: OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
if (IsStreamGarabageCollector()) {
platform::SetDeviceId(place.device);
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
}
for (auto &name : var_names) AddVar(name);
}
~ReferenceCountOpHandle() {
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
platform::SetDeviceId(gpu_place.device);
PADDLE_ENFORCE(cudaEventDestroy(event_));
}
}
std::string Name() const override { return "reference_count"; }
void AddVar(const std::string &name) {
auto it = var_names_.find(name);
if (it != var_names_.end())
++(it->second);
else
var_names_[name] = 1;
}
protected:
void RunImpl() override {
auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
std::vector<Tensor *> tensors;
for (auto &pair : var_names_) {
auto &name = pair.first;
auto it = ref_cnts_->find(name);
if (it == ref_cnts_->end()) continue;
auto *var = exec_scope->FindVar(name);
if (var == nullptr) continue;
if (var->IsType<LoDTensor>()) {
if (it->second.fetch_sub(pair.second) <= pair.second) {
tensors.emplace_back(var->GetMutable<LoDTensor>());
}
} else if (var->IsType<SelectedRows>()) {
if (it->second.fetch_sub(pair.second) <= pair.second) {
tensors.emplace_back(
var->GetMutable<SelectedRows>()->mutable_value());
}
}
}
if (!tensors.empty()) {
ClearTensors(tensors);
}
}
private:
void ClearTensors(const std::vector<Tensor *> &tensors) {
auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
if (gc != nullptr) {
auto compute_stream = dev_ctx_->stream();
auto callback_stream = gc->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(tensors, callback_func);
} else {
gc_->Add(tensors);
}
}
bool IsStreamGarabageCollector() const {
return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
}
const Scope *scope_;
platform::CUDADeviceContext *dev_ctx_;
std::unordered_map<std::string, int> var_names_;
GarbageCollector<Tensor> *gc_; // not own
AtomicReferenceCountMap *ref_cnts_; // not own
cudaEvent_t event_;
};
} // namespace details
} // namespace framework
} // namespace paddle

File diff suppressed because it is too large.

@ -14,7 +14,6 @@
#pragma once
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
@ -22,10 +21,6 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kGlobalReferenceCount[] = "reference_count";
constexpr char kCurReferenceCount[] = "current_reference_count";
constexpr char kGarbageCollector[] = "garbage_collector";
class ReferenceCountPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(

@ -0,0 +1,21 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
namespace paddle {
namespace framework {
namespace details {} // namespace details
} // namespace framework
} // namespace paddle

@ -0,0 +1,51 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
namespace paddle {
namespace framework {
namespace details {
class ComputationOpHandle;
using ReferenceCountMap = std::unordered_map<std::string, size_t>;
using AtomicReferenceCountMap =
std::unordered_map<std::string, std::atomic<size_t>>;
using GarbageCollectorMap =
std::map<platform::Place, std::unique_ptr<GarbageCollector>>;
const char kGlobalReferenceCount[] = "global_reference_count";
const char kRuntimeReferenceCount[] = "runtime_reference_count";
const char kGarbageCollector[] = "garbage_collector";
const char kAllPlaces[] = "all_places";
using LastLiveOpsOfVars =
std::unordered_map<std::string, std::unordered_set<ComputationOpHandle*>>;
const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
} // namespace details
} // namespace framework
} // namespace paddle
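
For illustration, a hedged sketch of how the per-place GarbageCollectorMap expected under kGarbageCollector might be built, mirroring the collector choices made in Executor::RunPreparedContext later in this diff (the builder function itself is hypothetical):

details::GarbageCollectorMap BuildGarbageCollectors(
    const std::vector<platform::Place> &places, size_t max_memory_size) {
  details::GarbageCollectorMap gcs;
  for (auto &place : places) {
    if (platform::is_cpu_place(place)) {
      gcs.emplace(place,
                  std::unique_ptr<GarbageCollector>(new CPUGarbageCollector(
                      boost::get<platform::CPUPlace>(place), max_memory_size)));
    }
#ifdef PADDLE_WITH_CUDA
    else if (platform::is_gpu_place(place)) {
      // UnsafeFastGPUGarbageCollector would be the fast-mode alternative.
      gcs.emplace(place,
                  std::unique_ptr<GarbageCollector>(new StreamGarbageCollector(
                      boost::get<platform::CUDAPlace>(place), max_memory_size)));
    }
#endif
  }
  return gcs;
}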

@ -18,9 +18,6 @@
#include <vector>
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
#endif
namespace paddle {
namespace framework {
@ -69,27 +66,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1;
#ifdef PADDLE_WITH_CUDA
const std::string gc_name = "garbage_collector";
DeviceGarbageCollectorMap *gc =
Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
: nullptr;
#endif
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
drop_scope_counter_ = 0;
// Wait for all computational streams
for (auto p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
#ifdef PADDLE_WITH_CUDA
if (gc != nullptr && platform::is_gpu_place(p)) {
auto gpu_place = boost::get<platform::CUDAPlace>(p);
auto &gc_at_place = gc->at(gpu_place.device);
gc_at_place->Wait();
gc_at_place->Reset();
}
#endif
}
for (auto &scope : local_scopes_) {
auto &local_scope =

@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include <deque>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
@ -41,11 +42,43 @@ namespace {
int kProgramId = -1;
} // namespace
static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
std::unordered_map<std::string, size_t> ref_cnts;
std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
skip_var_list.end());
auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
if (skip_vars.count(name)) continue;
auto* var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) continue;
auto type = var_desc->Proto()->type().type();
if (type != proto::VarType::LOD_TENSOR &&
type != proto::VarType::SELECTED_ROWS &&
type != proto::VarType::LOD_TENSOR_ARRAY) {
continue;
}
++ref_cnts[name];
}
}
};
for (auto op_desc : block.AllOps()) {
update_ref_cnts(op_desc, op_desc->Inputs());
update_ref_cnts(op_desc, op_desc->Outputs());
}
return ref_cnts;
}
ExecutorPrepareContext::ExecutorPrepareContext(
const framework::ProgramDesc& prog, size_t block_id)
const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& skip_ref_cnt_vars)
: prog_(prog), block_id_(block_id) {
if (GetEagerDeletionThreshold() >= 0) {
ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
skip_ref_cnt_vars);
}
}
@ -53,28 +86,40 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext";
}
template <typename RefCntMap>
static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
GarbageCollector<Tensor>* gc,
RefCntMap* ref_cnts) {
std::unordered_set<Tensor*> erase_tensors;
static void DeleteUnusedTensors(
const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
std::unordered_map<std::string, size_t>* ref_cnts) {
std::deque<std::shared_ptr<memory::Allocation>> garbages;
auto handler = [&](const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
auto it = ref_cnts->find(name);
if (it == ref_cnts->end()) continue;
if ((it->second)-- == 1) {
if (--(it->second) != 0) {
continue;
}
auto* var = scope.FindVar(name);
if (var != nullptr) {
VLOG(10) << "Erase tensor \'" << name << "\'";
continue;
}
VLOG(2) << "Erase variable " << name;
if (var->IsType<LoDTensor>()) {
erase_tensors.insert(var->GetMutable<LoDTensor>());
garbages.emplace_back(
var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
erase_tensors.insert(
var->GetMutable<SelectedRows>()->mutable_value());
}
garbages.emplace_back(var->GetMutable<SelectedRows>()
->mutable_value()
->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto& t : *lod_tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
var->Type().name(), name);
}
}
}
@ -83,8 +128,8 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
handler(op->Inputs());
handler(op->Outputs());
if (!erase_tensors.empty()) {
gc->Add(erase_tensors);
if (!garbages.empty()) {
gc->Add(std::move(garbages));
}
}
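
A small worked example of the counting scheme (standalone and illustrative, not part of this commit): a non-persistable variable that appears twice across the block's ops starts with a count of 2; DeleteUnusedTensors decrements the runtime copy once per occurrence after each op runs, and only at zero does the memory holder move into the garbage collector.

#include <cstddef>
#include <string>
#include <unordered_map>

int main() {
  // "x" is used by op A and op B, so GetNonPersistableReferenceCounts records 2.
  std::unordered_map<std::string, std::size_t> runtime_ref_cnts{{"x", 2}};
  --runtime_ref_cnts["x"];  // op A finished: 2 -> 1, "x" is still needed by op B
  --runtime_ref_cnts["x"];  // op B finished: 1 -> 0, "x" may be garbage-collected now
  return runtime_ref_cnts["x"] == 0 ? 0 : 1;
}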
@ -325,9 +370,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
}
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
const ProgramDesc& program, int block_id) {
const ProgramDesc& program, int block_id,
const std::vector<std::string>& skip_ref_cnt_vars) {
std::unique_ptr<ExecutorPrepareContext> ctx(
new ExecutorPrepareContext(program, block_id));
new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) {
@ -338,16 +384,28 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
}
std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids) {
const ProgramDesc& program, const std::vector<int>& block_ids,
const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) {
PADDLE_ENFORCE(
skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
"skip_ref_cnt_vars should be either empty or equals to block number %d",
block_ids.size());
std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
size_t idx = 0;
for (auto& bid : block_ids) {
auto* ctx = new ExecutorPrepareContext(program, bid);
ExecutorPrepareContext* ctx;
if (skip_ref_cnt_vars.empty()) {
ctx = new ExecutorPrepareContext(program, bid);
} else {
ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
}
PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
auto& block = program.Block(bid);
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
++idx;
}
return result;
}
@ -365,22 +423,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
}
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector<Tensor>> gc;
// WhileOp would set keep_kids to true,
// because WhileGradOp needs the scopes created in WhileOp.
// Perhaps we should not perform eager deletion in WhileOp;
// the scopes and variables created by WhileOp would be deleted
// in WhileGradOp.
std::unique_ptr<GarbageCollector> gc;
// skip while_op and while_grad_op temporarily
if (max_memory_size >= 0 && !keep_kids) {
ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
gc.reset(new DefaultStreamGarbageCollector<Tensor>(
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
boost::get<platform::CUDAPlace>(place_), max_memory_size));
} else {
gc.reset(new DefaultStreamGarbageCollector(
boost::get<platform::CUDAPlace>(place_), max_memory_size));
}
} else if (platform::is_cpu_place(place_)) {
#endif
gc.reset(new CPUGarbageCollector<Tensor>(
boost::get<platform::CPUPlace>(place_), max_memory_size));
gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place_),
max_memory_size));
#ifdef PADDLE_WITH_CUDA
}
#endif
@ -389,17 +448,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
for (auto& op : ctx->ops_) {
op->Run(*local_scope, place_);
if (gc != nullptr) {
if (gc) {
DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
&(ctx->cur_ref_cnts_));
&(ctx->runtime_ref_cnts_));
}
}
if (gc != nullptr) {
gc->Wait();
} else {
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}
if (local_scope != scope) {
scope->DeleteScope(local_scope);

@ -27,52 +27,21 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <typename T>
std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
const ProgramDesc& prog, size_t block_id) {
auto& block = prog.Block(block_id);
std::unordered_map<std::string, T> ref_cnts;
auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
auto* var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) continue;
auto type = var_desc->Proto()->type().type();
if (type != proto::VarType::LOD_TENSOR &&
type != proto::VarType::SELECTED_ROWS) {
continue;
}
auto it = ref_cnts.find(name);
if (it != ref_cnts.end()) {
++it->second;
} else {
ref_cnts[name] = 1;
}
}
}
};
for (auto op_desc : block.AllOps()) {
update_ref_cnts(op_desc, op_desc->Inputs());
update_ref_cnts(op_desc, op_desc->Outputs());
}
return ref_cnts;
}
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>());
~ExecutorPrepareContext();
void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; }
void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }
const framework::ProgramDesc& prog_;
size_t block_id_;
std::vector<std::unique_ptr<OperatorBase>> ops_;
std::unordered_map<std::string, int> ref_cnts_;
std::unordered_map<std::string, int> cur_ref_cnts_;
std::unordered_map<std::string, size_t> global_ref_cnts_;
std::unordered_map<std::string, size_t> runtime_ref_cnts_;
};
class Executor {
@ -108,10 +77,14 @@ class Executor {
const std::string& fetch_holder_name = "fetch");
static std::unique_ptr<ExecutorPrepareContext> Prepare(
const ProgramDesc& program, int block_id);
const ProgramDesc& program, int block_id,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>());
static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids);
const ProgramDesc& program, const std::vector<int>& block_ids,
const std::vector<std::vector<std::string>>& skip_ref_cnt_vars =
std::vector<std::vector<std::string>>());
void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);

@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
@ -174,6 +175,8 @@ void print_fetch_var(Scope* scope, std::string var_name) {
}
void ExecutorThreadWorker::TrainFiles() {
platform::SetNumThreads(1);
// TODO: make this configurable
SetDevice();

@ -0,0 +1,89 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/framework/garbage_collector.h"
namespace paddle {
namespace framework {
GarbageCollector::GarbageCollector(const platform::Place &place,
size_t max_memory_size)
: max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
garbages_.reset(new GarbageQueue());
dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
}
CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
callback();
}
#ifdef PADDLE_WITH_CUDA
UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
const platform::CUDAPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void UnsafeFastGPUGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
const platform::CUDAPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void DefaultStreamGarbageCollector::Wait() const {
static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void DefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {
platform::CUDADeviceGuard guard(place.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
callback_manager_.reset(new platform::StreamCallbackManager(stream_));
}
StreamGarbageCollector::~StreamGarbageCollector() {
auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(place.device);
PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
}
cudaStream_t StreamGarbageCollector::stream() const { return stream_; }
void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
void StreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback_manager_->AddCallback(callback);
}
#endif
} // namespace framework
} // namespace paddle

@ -14,7 +14,6 @@
#pragma once
#include <algorithm>
#include <deque>
#include <functional>
#include <memory>
@ -24,134 +23,74 @@
namespace paddle {
namespace framework {
// T should have memory_size() and clear() method
template <typename T>
class GarbageCollector {
public:
GarbageCollector(const platform::Place &place, size_t max_memory_size)
: max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
garbages_.reset(new std::deque<T *>());
dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
}
using GarbageQueue = std::deque<std::shared_ptr<memory::Allocation>>;
virtual ~GarbageCollector() {}
GarbageCollector(const platform::Place &place, size_t max_memory_size);
void Reset() {
std::lock_guard<std::mutex> guard(mutex_);
garbages_.reset(new std::deque<T *>());
cur_memory_size_ = 0;
}
virtual ~GarbageCollector() = default;
virtual void Wait() const {}
template <typename Container>
void Add(const Container &objs) {
Add(objs, []() {});
}
void Add(Container &&objs);
template <typename Container, typename Callback>
void Add(const Container &objs, Callback &&callback) {
std::shared_ptr<std::deque<T *>> clear_deque;
{
std::lock_guard<std::mutex> guard(mutex_);
for (auto *obj : objs) {
garbages_->push_back(obj);
cur_memory_size_ += obj->memory_size();
}
if (cur_memory_size_ >= max_memory_size_) {
cur_memory_size_ = 0;
clear_deque = garbages_;
garbages_.reset(new std::deque<T *>());
}
}
if (clear_deque != nullptr) {
callback();
ClearCallback([=]() {
for (auto *obj : *clear_deque) obj->clear();
});
}
}
virtual void Wait() const {}
void Add(Container &&objs, Callback &&callback);
protected:
virtual void ClearCallback(const std::function<void()> &callback) = 0;
platform::DeviceContext *dev_ctx_;
std::shared_ptr<std::deque<T *>> garbages_;
std::unique_ptr<GarbageQueue> garbages_;
mutable std::mutex mutex_;
const size_t max_memory_size_;
size_t cur_memory_size_ = 0;
size_t cur_memory_size_{0};
};
template <typename T>
class CPUGarbageCollector : public GarbageCollector<T> {
class CPUGarbageCollector : public GarbageCollector {
public:
CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
: GarbageCollector<T>(place, max_memory_size) {}
CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override {
callback();
}
void ClearCallback(const std::function<void()> &callback) override;
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class DefaultStreamGarbageCollector : public GarbageCollector<T> {
class UnsafeFastGPUGarbageCollector : public GarbageCollector {
public:
DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
size_t max_memory_size)
: GarbageCollector<T>(place, max_memory_size) {}
UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
size_t max_memory_size);
cudaStream_t stream() const {
return static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
->stream();
}
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
void Wait() const override {
this->dev_ctx_->Wait();
static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
class DefaultStreamGarbageCollector : public GarbageCollector {
public:
DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override {
static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
void ClearCallback(const std::function<void()> &callback) override;
};
template <typename T>
class StreamGarbageCollector : public GarbageCollector<T> {
class StreamGarbageCollector : public GarbageCollector {
public:
StreamGarbageCollector(const platform::CUDAPlace &place,
size_t max_memory_size)
: GarbageCollector<T>(place, max_memory_size) {
PADDLE_ENFORCE(cudaSetDevice(place.device));
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
callback_manager_.reset(new platform::StreamCallbackManager(stream_));
}
size_t max_memory_size);
~StreamGarbageCollector() {
auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
PADDLE_ENFORCE(cudaSetDevice(place.device));
PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
}
~StreamGarbageCollector();
void Wait() const override {
PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
std::lock_guard<std::mutex> guard(this->mutex_);
callback_manager_->Wait();
}
void Wait() const override;
cudaStream_t stream() const { return stream_; }
cudaStream_t stream() const;
protected:
void ClearCallback(const std::function<void()> &callback) override {
std::lock_guard<std::mutex> guard(this->mutex_);
callback_manager_->AddCallback(callback);
}
void ClearCallback(const std::function<void()> &callback) override;
private:
cudaStream_t stream_;
@ -159,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector<T> {
};
#endif
template <typename Container>
void GarbageCollector::Add(Container &&objs) {
Add(std::forward<Container>(objs), []() {});
}
template <typename Container, typename Callback>
void GarbageCollector::Add(Container &&objs, Callback &&callback) {
GarbageQueue *garbage_queue = nullptr;
{
std::lock_guard<std::mutex> guard(mutex_);
for (auto &obj : objs) {
if (!obj) continue;
cur_memory_size_ += obj->size();
garbages_->push_back(std::move(obj));
}
if (cur_memory_size_ >= max_memory_size_) {
cur_memory_size_ = 0;
garbage_queue = garbages_.release();
garbages_.reset(new GarbageQueue());
}
}
if (garbage_queue) {
callback();
ClearCallback([garbage_queue]() { delete garbage_queue; });
}
}
} // namespace framework
} // namespace paddle
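
A hedged usage sketch of the reworked interface (illustrative only, assuming the surrounding Paddle headers): instead of freeing a tensor's buffer immediately, the caller moves its memory holder into the collector; the allocation stays alive in the internal GarbageQueue until the accumulated size crosses max_memory_size_, and the whole queue is then released through ClearCallback.

void ReleaseThroughGC(GarbageCollector *gc, LoDTensor *tensor) {
  std::deque<std::shared_ptr<memory::Allocation>> garbages;
  garbages.emplace_back(tensor->MoveMemoryHolder());  // tensor gives up its buffer
  // The two-argument overload also takes a callback that runs right before the
  // queued allocations are cleared, as EagerDeletionOpHandle uses for CUDA events.
  gc->Add(std::move(garbages));
}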

@ -73,14 +73,21 @@ class Graph {
}
bool Has(const std::string &attr_name) const {
return attrs_.find(attr_name) != attrs_.end();
return attrs_.count(attr_name) > 0;
}
template <typename AttrType>
AttrType &Get(const std::string &attr_name) const {
PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
attr_name);
try {
return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
} catch (boost::bad_any_cast &) {
PADDLE_THROW(
"Invalid attribute type of %s error, expected: %s, actual: %s",
attr_name, typeid(AttrType *).name(),
attrs_.at(attr_name).type().name());
}
}
template <typename AttrType>

@ -51,11 +51,18 @@ class Pass {
AttrType &Get(const std::string &attr_name) const {
PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
"%s attr not registered for pass.", attr_name);
try {
return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
} catch (boost::bad_any_cast &) {
PADDLE_THROW(
"Invalid attribute type of %s error, expected: %s, actual: %s",
attr_name, typeid(AttrType *).name(),
attrs_.at(attr_name).type().name());
}
}
bool Has(const std::string &attr_name) const {
return attrs_.find(attr_name) != attrs_.end();
return attrs_.count(attr_name) > 0;
}
void Erase(const std::string &attr_name) {

Some files were not shown because too many files have changed in this diff.
